""" Unit Tests for CV Vocab Pipeline (cv_vocab_pipeline.py) Tests cover: - Data classes (PageRegion, VocabRow, PipelineResult) - Stage 2: Deskew image - Stage 3: Dewarp (pass-through) - Stage 4: Image preparation (OCR + Layout images) - Stage 5: Layout analysis (content bounds, projection profiles, column detection) - Stage 6: Multi-pass OCR region handling - Stage 7: Line grouping and vocabulary matching - Noise filter functions (_is_noise_tail_token, _clean_cell_text) - Phonetic detection (_is_phonetic_only_text) - Phonetic & continuation row merging - Orchestrator (run_cv_pipeline) DSGVO Note: All tests run locally with synthetic data. No external API calls. """ import pytest import numpy as np from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock from dataclasses import asdict # Import module under test from cv_vocab_pipeline import ( PageRegion, VocabRow, PipelineResult, deskew_image, dewarp_image, create_ocr_image, create_layout_image, _find_content_bounds, _filter_narrow_runs, _build_margin_regions, analyze_layout, _group_words_into_lines, match_lines_to_vocab, run_cv_pipeline, CV2_AVAILABLE, TESSERACT_AVAILABLE, CV_PIPELINE_AVAILABLE, _is_noise_tail_token, _clean_cell_text, _is_phonetic_only_text, _merge_phonetic_continuation_rows, _merge_continuation_rows, ) # ============================================= # FIXTURES # ============================================= @pytest.fixture def white_image(): """Create a simple 300x200 white BGR image.""" return np.ones((200, 300, 3), dtype=np.uint8) * 255 @pytest.fixture def text_like_image(): """Create a 600x400 image with dark text-like regions simulating 3 columns.""" img = np.ones((400, 600, 3), dtype=np.uint8) * 255 # Column 1 (EN): x=20..170 for y in range(50, 350, 30): img[y:y+15, 30:160, :] = 30 # Dark text lines # Gap between col1 and col2: x=170..210 (white) # Column 2 (DE): x=210..370 for y in range(50, 350, 30): img[y:y+15, 220:360, :] = 30 # Gap between col2 and col3: x=370..410 (white) # 
Column 3 (Example): x=410..580 for y in range(50, 350, 30): img[y:y+15, 420:570, :] = 30 return img @pytest.fixture def binary_image(): """Create a binary (single-channel) image for OCR tests.""" # White background (255) with some black text-like areas img = np.ones((400, 600), dtype=np.uint8) * 255 # Add text-like dark bands for y in range(50, 350, 30): img[y:y+15, 30:570] = 0 return img @pytest.fixture def sample_words_column_en(): """Sample OCR word dicts for English column.""" return [ {'text': 'achieve', 'left': 30, 'top': 50, 'width': 80, 'height': 15, 'conf': 90, 'region_type': 'column_en'}, {'text': 'improve', 'left': 30, 'top': 80, 'width': 80, 'height': 15, 'conf': 85, 'region_type': 'column_en'}, {'text': 'success', 'left': 30, 'top': 110, 'width': 80, 'height': 15, 'conf': 92, 'region_type': 'column_en'}, ] @pytest.fixture def sample_words_column_de(): """Sample OCR word dicts for German column.""" return [ {'text': 'erreichen', 'left': 220, 'top': 52, 'width': 100, 'height': 15, 'conf': 88, 'region_type': 'column_de'}, {'text': 'verbessern', 'left': 220, 'top': 82, 'width': 100, 'height': 15, 'conf': 80, 'region_type': 'column_de'}, {'text': 'Erfolg', 'left': 220, 'top': 112, 'width': 100, 'height': 15, 'conf': 95, 'region_type': 'column_de'}, ] @pytest.fixture def sample_words_column_ex(): """Sample OCR word dicts for Example column.""" return [ {'text': 'She', 'left': 420, 'top': 50, 'width': 30, 'height': 15, 'conf': 85, 'region_type': 'column_example'}, {'text': 'achieved', 'left': 455, 'top': 50, 'width': 70, 'height': 15, 'conf': 80, 'region_type': 'column_example'}, {'text': 'her', 'left': 530, 'top': 50, 'width': 30, 'height': 15, 'conf': 90, 'region_type': 'column_example'}, {'text': 'goals.', 'left': 420, 'top': 52, 'width': 50, 'height': 15, 'conf': 75, 'region_type': 'column_example'}, ] @pytest.fixture def sample_regions(): """Sample 3-column PageRegion layout.""" return [ PageRegion(type='column_en', x=0, y=50, width=190, height=300), 
PageRegion(type='column_de', x=210, y=50, width=160, height=300), PageRegion(type='column_example', x=410, y=50, width=190, height=300), ] # ============================================= # DATA CLASS TESTS # ============================================= class TestDataClasses: """Test data classes for correct defaults and fields.""" def test_page_region_creation(self): region = PageRegion(type='column_en', x=10, y=20, width=100, height=200) assert region.type == 'column_en' assert region.x == 10 assert region.y == 20 assert region.width == 100 assert region.height == 200 def test_vocab_row_defaults(self): row = VocabRow() assert row.english == "" assert row.german == "" assert row.example == "" assert row.confidence == 0.0 assert row.y_position == 0 def test_vocab_row_with_values(self): row = VocabRow(english="test", german="Test", example="A test.", confidence=85.5, y_position=100) assert row.english == "test" assert row.german == "Test" assert row.confidence == 85.5 def test_pipeline_result_defaults(self): result = PipelineResult() assert result.vocabulary == [] assert result.word_count == 0 assert result.columns_detected == 0 assert result.duration_seconds == 0.0 assert result.stages == {} assert result.error is None def test_pipeline_result_error(self): result = PipelineResult(error="Something went wrong") assert result.error == "Something went wrong" # ============================================= # STAGE 2: DESKEW TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestDeskew: """Test deskew (rotation correction) stage.""" def test_deskew_straight_image(self, white_image): """A perfectly straight image should not be rotated.""" corrected, angle = deskew_image(white_image) assert abs(angle) < 0.1 assert corrected.shape == white_image.shape def test_deskew_returns_tuple(self, white_image): """deskew_image must return (image, angle) tuple.""" result = deskew_image(white_image) assert 
isinstance(result, tuple) assert len(result) == 2 assert isinstance(result[0], np.ndarray) assert isinstance(result[1], float) def test_deskew_preserves_shape(self, text_like_image): """Output image should have same shape as input.""" corrected, _ = deskew_image(text_like_image) assert corrected.shape == text_like_image.shape # ============================================= # STAGE 3: DEWARP TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestDewarp: """Test dewarp stage (returns (image, info) tuple).""" def test_dewarp_returns_tuple(self, white_image): """dewarp_image must return (image, dewarp_info) tuple.""" result = dewarp_image(white_image) assert isinstance(result, tuple) assert len(result) == 2 img_out, info = result assert isinstance(img_out, np.ndarray) assert isinstance(info, dict) assert "shear_degrees" in info def test_dewarp_preserves_shape(self, text_like_image): """Output image should have same shape as input.""" img_out, _ = dewarp_image(text_like_image) assert img_out.shape == text_like_image.shape def test_dewarp_white_image_no_correction(self, white_image): """A uniform white image should get no shear correction.""" img_out, info = dewarp_image(white_image) assert abs(info["shear_degrees"]) < 0.5 assert img_out.shape == white_image.shape # ============================================= # STAGE 4: IMAGE PREPARATION TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestImagePreparation: """Test OCR and layout image creation.""" def test_create_ocr_image_returns_grayscale(self, text_like_image): """OCR image should be single-channel (binarized).""" ocr_img = create_ocr_image(text_like_image) assert len(ocr_img.shape) == 2 # Single channel assert ocr_img.dtype == np.uint8 def test_create_ocr_image_is_binary(self, text_like_image): """OCR image should contain only 0 and 255 values.""" ocr_img = 
create_ocr_image(text_like_image) unique_vals = np.unique(ocr_img) assert all(v in [0, 255] for v in unique_vals) def test_create_layout_image_returns_grayscale(self, text_like_image): """Layout image should be single-channel (CLAHE enhanced).""" layout_img = create_layout_image(text_like_image) assert len(layout_img.shape) == 2 assert layout_img.dtype == np.uint8 def test_create_layout_image_enhanced_contrast(self, text_like_image): """Layout image should have different histogram than simple grayscale.""" import cv2 gray = cv2.cvtColor(text_like_image, cv2.COLOR_BGR2GRAY) layout_img = create_layout_image(text_like_image) # CLAHE should change the histogram assert layout_img.shape == gray.shape # ============================================= # STAGE 5: LAYOUT ANALYSIS TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestContentBounds: """Test _find_content_bounds helper.""" def test_empty_image(self): """Fully white (inverted = black) image should return full bounds.""" inv = np.zeros((200, 300), dtype=np.uint8) left, right, top, bottom = _find_content_bounds(inv) # With no content, bounds should span the image assert left >= 0 assert right <= 300 assert top >= 0 assert bottom <= 200 def test_centered_content(self): """Content in center should give tight bounds.""" inv = np.zeros((400, 600), dtype=np.uint8) # Add content block in center inv[100:300, 50:550] = 255 left, right, top, bottom = _find_content_bounds(inv) assert left <= 52 # ~50 with 2px margin assert right >= 548 # ~550 with 2px margin assert top <= 102 assert bottom >= 298 @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestLayoutAnalysis: """Test analyze_layout for column detection.""" def test_returns_list_of_regions(self, text_like_image): """analyze_layout should return a list of PageRegion.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) 
regions = analyze_layout(layout_img, ocr_img) assert isinstance(regions, list) assert all(isinstance(r, PageRegion) for r in regions) def test_detects_columns(self, text_like_image): """With clear 3-column image, should detect at least 1 column.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) regions = analyze_layout(layout_img, ocr_img) column_regions = [r for r in regions if r.type.startswith('column')] assert len(column_regions) >= 1 def test_single_column_fallback(self): """Image with no clear columns should fall back to single column.""" # Uniform text across full width img = np.ones((400, 600, 3), dtype=np.uint8) * 255 for y in range(50, 350, 20): img[y:y+10, 20:580, :] = 30 # Full-width text ocr_img = create_ocr_image(img) layout_img = create_layout_image(img) regions = analyze_layout(layout_img, ocr_img) column_regions = [r for r in regions if r.type.startswith('column')] # Should at least return 1 column (full page fallback) assert len(column_regions) >= 1 def test_region_types_are_valid(self, text_like_image): """All region types should be from the expected set.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) regions = analyze_layout(layout_img, ocr_img) valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'} for r in regions: assert r.type in valid_types, f"Unexpected region type: {r.type}" # ============================================= # STAGE 7: LINE GROUPING TESTS # ============================================= class TestLineGrouping: """Test _group_words_into_lines function.""" def test_empty_input(self): """Empty word list should return empty lines.""" assert _group_words_into_lines([]) == [] def test_single_word(self): """Single word should return one line with one word.""" words = [{'text': 'hello', 'left': 10, 'top': 50, 'width': 50, 'height': 15, 'conf': 90}] lines = _group_words_into_lines(words) assert len(lines) == 1 
assert len(lines[0]) == 1 assert lines[0][0]['text'] == 'hello' def test_words_on_same_line(self): """Words close in Y should be grouped into one line.""" words = [ {'text': 'hello', 'left': 10, 'top': 50, 'width': 50, 'height': 15, 'conf': 90}, {'text': 'world', 'left': 70, 'top': 52, 'width': 50, 'height': 15, 'conf': 85}, ] lines = _group_words_into_lines(words, y_tolerance_px=10) assert len(lines) == 1 assert len(lines[0]) == 2 def test_words_on_different_lines(self): """Words far apart in Y should be on different lines.""" words = [ {'text': 'line1', 'left': 10, 'top': 50, 'width': 50, 'height': 15, 'conf': 90}, {'text': 'line2', 'left': 10, 'top': 100, 'width': 50, 'height': 15, 'conf': 85}, {'text': 'line3', 'left': 10, 'top': 150, 'width': 50, 'height': 15, 'conf': 88}, ] lines = _group_words_into_lines(words, y_tolerance_px=20) assert len(lines) == 3 def test_words_sorted_by_x_within_line(self): """Words within a line should be sorted by X position.""" words = [ {'text': 'world', 'left': 100, 'top': 50, 'width': 50, 'height': 15, 'conf': 85}, {'text': 'hello', 'left': 10, 'top': 52, 'width': 50, 'height': 15, 'conf': 90}, ] lines = _group_words_into_lines(words, y_tolerance_px=10) assert len(lines) == 1 assert lines[0][0]['text'] == 'hello' assert lines[0][1]['text'] == 'world' # ============================================= # STAGE 7: VOCABULARY MATCHING TESTS # ============================================= class TestVocabMatching: """Test match_lines_to_vocab function.""" def test_empty_results(self, sample_regions): """Empty OCR results should return empty vocab.""" vocab = match_lines_to_vocab({}, sample_regions) assert vocab == [] def test_en_only(self, sample_words_column_en, sample_regions): """Only EN words should create entries with empty DE/example.""" ocr_results = {'column_en': sample_words_column_en} vocab = match_lines_to_vocab(ocr_results, sample_regions) assert len(vocab) == 3 for row in vocab: assert row.english != "" assert row.german == 
"" def test_en_de_matching(self, sample_words_column_en, sample_words_column_de, sample_regions): """EN and DE words on same Y should be matched.""" ocr_results = { 'column_en': sample_words_column_en, 'column_de': sample_words_column_de, } vocab = match_lines_to_vocab(ocr_results, sample_regions, y_tolerance_px=25) assert len(vocab) == 3 # First entry should match achieve <-> erreichen assert vocab[0].english == 'achieve' assert vocab[0].german == 'erreichen' def test_full_3_column_matching(self, sample_words_column_en, sample_words_column_de, sample_words_column_ex, sample_regions): """All 3 columns should be matched by Y coordinate.""" ocr_results = { 'column_en': sample_words_column_en, 'column_de': sample_words_column_de, 'column_example': sample_words_column_ex, } vocab = match_lines_to_vocab(ocr_results, sample_regions, y_tolerance_px=25) assert len(vocab) >= 1 # First entry should have example text assert vocab[0].english == 'achieve' assert vocab[0].example != "" def test_sorted_by_y_position(self, sample_words_column_en, sample_regions): """Result should be sorted by Y position.""" ocr_results = {'column_en': sample_words_column_en} vocab = match_lines_to_vocab(ocr_results, sample_regions) positions = [row.y_position for row in vocab] assert positions == sorted(positions) def test_skips_short_entries(self, sample_regions): """Very short text (< 2 chars) should be skipped.""" words = [ {'text': 'a', 'left': 30, 'top': 50, 'width': 10, 'height': 15, 'conf': 90, 'region_type': 'column_en'}, {'text': 'valid', 'left': 30, 'top': 80, 'width': 50, 'height': 15, 'conf': 90, 'region_type': 'column_en'}, ] ocr_results = {'column_en': words} vocab = match_lines_to_vocab(ocr_results, sample_regions) assert len(vocab) == 1 assert vocab[0].english == 'valid' def test_confidence_calculation(self, sample_words_column_en, sample_words_column_de, sample_regions): """Confidence should be the average of matched columns.""" ocr_results = { 'column_en': sample_words_column_en, 
'column_de': sample_words_column_de, } vocab = match_lines_to_vocab(ocr_results, sample_regions, y_tolerance_px=25) # First entry: EN conf=90, DE conf=88 → avg=89 assert vocab[0].confidence > 0 assert vocab[0].confidence == pytest.approx(89.0, abs=1.0) # ============================================= # ORCHESTRATOR TESTS # ============================================= class TestOrchestrator: """Test run_cv_pipeline orchestrator.""" @pytest.mark.asyncio async def test_no_input_returns_error(self): """Pipeline without input should return error.""" result = await run_cv_pipeline() assert result.error is not None assert "No input data" in result.error @pytest.mark.asyncio async def test_pipeline_unavailable(self): """When CV_PIPELINE_AVAILABLE is False, should return error.""" with patch('cv_vocab_pipeline.CV_PIPELINE_AVAILABLE', False): result = await run_cv_pipeline(pdf_data=b"fake") assert result.error is not None assert "not available" in result.error @pytest.mark.asyncio @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") async def test_pipeline_with_image_data(self): """Pipeline with a real synthetic image should run without errors.""" import cv2 # Create a simple test image (white with some text-like black bars) img = np.ones((200, 300, 3), dtype=np.uint8) * 255 for y in range(30, 170, 25): img[y:y+12, 20:280, :] = 30 _, img_bytes = cv2.imencode('.png', img) image_data = img_bytes.tobytes() with patch('cv_vocab_pipeline.pytesseract') as mock_tess: # Mock Tesseract to return empty results mock_tess.image_to_data.return_value = { 'text': [], 'conf': [], 'left': [], 'top': [], 'width': [], 'height': [], } mock_tess.Output.DICT = 'dict' result = await run_cv_pipeline(image_data=image_data) assert result.error is None assert result.image_width == 300 assert result.image_height == 200 assert 'render' in result.stages assert 'deskew' in result.stages @pytest.mark.asyncio @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") async def 
test_pipeline_records_timing(self): """Pipeline should record timing for each stage.""" import cv2 img = np.ones((100, 150, 3), dtype=np.uint8) * 255 _, img_bytes = cv2.imencode('.png', img) with patch('cv_vocab_pipeline.pytesseract') as mock_tess: mock_tess.image_to_data.return_value = { 'text': [], 'conf': [], 'left': [], 'top': [], 'width': [], 'height': [], } mock_tess.Output.DICT = 'dict' result = await run_cv_pipeline(image_data=img_bytes.tobytes()) assert result.duration_seconds >= 0 assert all(v >= 0 for v in result.stages.values()) @pytest.mark.asyncio async def test_pipeline_result_format(self): """PipelineResult vocabulary should be list of dicts with expected keys.""" result = PipelineResult() result.vocabulary = [ {"english": "test", "german": "Test", "example": "A test.", "confidence": 90.0} ] assert len(result.vocabulary) == 1 entry = result.vocabulary[0] assert "english" in entry assert "german" in entry assert "example" in entry assert "confidence" in entry # ============================================= # INTEGRATION-STYLE TESTS (with mocked Tesseract) # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestStageIntegration: """Test multiple stages together (still unit-test level with mocked OCR).""" def test_image_prep_to_layout(self, text_like_image): """Stages 4→5: image prep feeds layout analysis correctly.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) assert ocr_img.shape[:2] == text_like_image.shape[:2] assert layout_img.shape[:2] == text_like_image.shape[:2] regions = analyze_layout(layout_img, ocr_img) assert len(regions) >= 1 def test_deskew_to_image_prep(self, text_like_image): """Stages 2→4: deskew output can be processed by image prep.""" corrected, angle = deskew_image(text_like_image) ocr_img = create_ocr_image(corrected) layout_img = create_layout_image(corrected) assert ocr_img.shape[:2] == corrected.shape[:2] 
assert layout_img.shape[:2] == corrected.shape[:2] # ============================================= # NOISE FILTER TESTS # ============================================= class TestNoiseFilter: """Test _is_noise_tail_token for trailing OCR noise detection.""" # --- Tokens that should be KEPT (return False) --- @pytest.mark.parametrize("token", [ # Compound words with hyphens "money-saver", "under-", "well-known", # Words with parenthesized parts (dictionary entries) "Schild(chen)", "(Salat-)Gurke", "(auf)", "(on)", "selbst)", "(wir", "Tanz(veranstaltung)", "(zer)brechen", # Phonetic brackets "serva]", "['mani", "[eg]", "[maus]", # Words with trailing punctuation "cupcakes.", "sister.", "mice", # Abbreviations "e.g.", "sth.", "usw.", "adj.", # Ellipsis "...", "\u2026", # Regular words "the", "cat", "big", "run", "set", "ago", ]) def test_keep_real_tokens(self, token): """Real words, dictionary punctuation, and phonetic brackets are kept.""" assert _is_noise_tail_token(token) is False, f"Should keep {token!r}" # --- Tokens that should be FILTERED (return True) --- @pytest.mark.parametrize("token", [ # Pure non-alpha "B|", "3d", "x7", ")", "|", "@", "3", # Very short non-dictionary fragments "ee", "k", "zz", "qq", # Empty "", " ", ]) def test_filter_noise_tokens(self, token): """OCR noise fragments are filtered.""" assert _is_noise_tail_token(token) is True, f"Should filter {token!r}" class TestCleanCellText: """Test _clean_cell_text integration (full text → cleaned text).""" def test_empty_returns_empty(self): assert _clean_cell_text("") == "" assert _clean_cell_text(" ") == "" def test_real_word_unchanged(self): assert _clean_cell_text("cupcakes") == "cupcakes" def test_strips_trailing_noise(self): """Trailing noise tokens should be removed.""" result = _clean_cell_text("cupcakes B|") assert result == "cupcakes" def test_keeps_trailing_real_word(self): """Trailing real words should be kept.""" result = _clean_cell_text("big cat") assert result == "big cat" def 
test_abbreviation_kept(self): """Known abbreviations should not be cleared.""" result = _clean_cell_text("e.g.") assert result == "e.g." def test_pure_garbage_cleared(self): """OCR garbage without real words should be cleared.""" result = _clean_cell_text("3d |x") assert result == "" def test_compound_word_preserved(self): """Compound words with hyphens should be preserved.""" result = _clean_cell_text("money-saver") assert result == "money-saver" def test_parenthesized_word_preserved(self): result = _clean_cell_text("(Salat-)Gurke") assert result == "(Salat-)Gurke" def test_multiple_trailing_noise(self): """Multiple trailing noise tokens should all be removed.""" result = _clean_cell_text("achieve 3 |") assert result == "achieve" class TestPhoneticOnlyText: """Test _is_phonetic_only_text for phonetic transcription detection.""" @pytest.mark.parametrize("text,expected", [ # Phonetic-only patterns → True ("['mani serva]", True), ("[dɑːns]", True), ("[\"a:mand]", True), ("['wɜːkʃɒp]", True), # serva] has 5 alpha chars after bracket removal → NOT phonetic-only ("serva]", False), # NOT phonetic-only → False ("almond ['a:mand]", False), ("Mandel", False), ("cupcakes", False), ("", False), ("achieve", False), ("money-saver ['mani]", False), ]) def test_phonetic_detection(self, text, expected): assert _is_phonetic_only_text(text) is expected, \ f"_is_phonetic_only_text({text!r}) should be {expected}" class TestMergePhoneticContinuationRows: """Test _merge_phonetic_continuation_rows for phonetic row merging.""" def test_empty_list(self): assert _merge_phonetic_continuation_rows([]) == [] def test_single_entry(self): entries = [{"english": "cat", "german": "Katze", "example": ""}] result = _merge_phonetic_continuation_rows(entries) assert len(result) == 1 assert result[0]["english"] == "cat" def test_merges_phonetic_row(self): """Phonetic-only row should merge into previous entry.""" entries = [ {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 
0}, {"english": "['mani serva]", "german": "", "example": "", "row_index": 1}, ] result = _merge_phonetic_continuation_rows(entries) assert len(result) == 1 assert result[0]["english"] == "money-saver ['mani serva]" assert result[0]["german"] == "Sparfuchs" def test_no_merge_when_de_present(self): """Row with DE text should NOT be merged even if EN looks phonetic.""" entries = [ {"english": "cat", "german": "Katze", "example": ""}, {"english": "[kæt]", "german": "some text", "example": ""}, ] result = _merge_phonetic_continuation_rows(entries) assert len(result) == 2 def test_no_merge_regular_rows(self): """Normal vocab rows should not be merged.""" entries = [ {"english": "cat", "german": "Katze", "example": ""}, {"english": "dog", "german": "Hund", "example": ""}, ] result = _merge_phonetic_continuation_rows(entries) assert len(result) == 2 def test_merges_example_too(self): """If phonetic row has example text, it should merge into previous.""" entries = [ {"english": "dance", "german": "tanzen", "example": "", "row_index": 0}, {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1}, ] result = _merge_phonetic_continuation_rows(entries) assert len(result) == 1 assert result[0]["english"] == "dance [dɑːns]" assert result[0]["example"] == "Let's dance." 
class TestMergeContinuationRows:
    """Test _merge_continuation_rows for multi-line entry merging."""

    def test_empty_list(self):
        assert _merge_continuation_rows([]) == []

    def test_no_merge_independent_rows(self):
        """Rows with both EN and DE should not be merged."""
        entries = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "dog", "german": "Hund", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_merge_lowercase_continuation(self):
        """Lowercase EN with empty DE should merge into previous."""
        entries = [
            {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0},
            {"english": "with sth.", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 1
        assert result[0]["english"] == "to put up with sth."
        assert result[0]["german"] == "aufstellen"

    def test_no_merge_uppercase_start(self):
        """EN starting with uppercase and empty DE is likely its own entry, not a continuation."""
        entries = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "Dog", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_when_previous_ends_with_period(self):
        """If previous entry ends with sentence terminator, next is not continuation."""
        entries = [
            {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0},
            {"english": "really nice", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_no_merge_long_text(self):
        """Text with 4+ words is likely an example sentence, not continuation."""
        entries = [
            {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0},
            {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2

    def test_first_entry_not_merged(self):
        """First entry with empty DE should not crash (no previous)."""
        entries = [
            {"english": "something", "german": "", "example": "", "row_index": 0},
            {"english": "cat", "german": "Katze", "example": "", "row_index": 1},
        ]
        result = _merge_continuation_rows(entries)
        assert len(result) == 2


# =============================================
# Test: Content-Bounds Scan-Artifact Filtering
# =============================================

class TestContentBoundsFiltering:
    """Test that _find_content_bounds filters narrow scan artifacts."""

    def test_thin_vertical_line_ignored(self):
        """A 2px black line at the left edge should not pull left_x leftward."""
        inv = np.zeros((400, 600), dtype=np.uint8)
        # Main content block in the middle
        inv[50:350, 100:550] = 255
        # 2px thin vertical scan artifact at x=5..6
        inv[50:350, 5:7] = 255
        left, right, top, bottom = _find_content_bounds(inv)
        # left_x must be near 100 (the real content), not near 5
        assert left >= 90, f"left_x={left} should be >=90 (near real content, not artifact)"

    def test_thick_content_preserved(self):
        """A 50px wide text block is real content and must not be filtered."""
        inv = np.zeros((400, 600), dtype=np.uint8)
        inv[50:350, 80:130] = 255   # 50px wide block
        inv[50:350, 200:500] = 255  # wider block
        left, right, top, bottom = _find_content_bounds(inv)
        assert left <= 82, f"left_x={left} should be <=82 (50px block is real content)"

    def test_no_artifacts_unchanged(self):
        """Normal image without artifacts: bounds should match content."""
        inv = np.zeros((400, 600), dtype=np.uint8)
        inv[100:300, 50:550] = 255
        left, right, top, bottom = _find_content_bounds(inv)
        assert left <= 52
        assert right >= 548
        assert top <= 105
        assert bottom >= 295

    def test_right_edge_artifact_ignored(self):
        """A thin vertical line at the right edge should not pull right_x rightward."""
        inv = np.zeros((400, 600), dtype=np.uint8)
        inv[50:350, 50:500] = 255    # real content
        inv[50:350, 595:598] = 255   # 3px artifact at right edge
        left, right, top, bottom = _find_content_bounds(inv)
        assert right <= 510, f"right_x={right} should be <=510, ignoring right-edge artifact"

    def test_horizontal_line_ignored(self):
        """A thin horizontal line at the top should not pull top_y upward."""
        inv = np.zeros((400, 600), dtype=np.uint8)
        inv[100:350, 50:550] = 255  # real content
        inv[2:4, 50:550] = 255      # 2px horizontal artifact at top
        left, right, top, bottom = _find_content_bounds(inv)
        assert top >= 90, f"top_y={top} should be >=90 (ignoring thin top line)"


class TestFilterNarrowRuns:
    """Test the _filter_narrow_runs helper directly."""

    def test_removes_short_run(self):
        mask = np.array([False, True, True, False, True, True, True, True, True, False])
        result = _filter_narrow_runs(mask, min_width=3)
        # The 2-wide run at indices 1-2 should be removed
        assert not result[1]
        assert not result[2]
        # The 5-wide run at indices 4-8 should remain
        assert result[4]
        assert result[8]

    def test_keeps_wide_run(self):
        mask = np.array([True] * 10)
        result = _filter_narrow_runs(mask, min_width=5)
        assert all(result)

    def test_all_narrow(self):
        mask = np.array([True, True, False, True, False])
        result = _filter_narrow_runs(mask, min_width=3)
        assert not any(result)


# =============================================
# Test: Margin Regions
# =============================================

class TestMarginRegions:
    """Test _build_margin_regions and margin integration."""

    def test_margin_left_created(self):
        """When left_x > 5, a margin_left region should be created."""
        existing = [
            PageRegion(type='column_en', x=100, y=50, width=200, height=300),
            PageRegion(type='column_de', x=320, y=50, width=200, height=300),
        ]
        margins = _build_margin_regions(existing, left_x=100, right_x=520,
                                        img_w=600, top_y=50, content_h=300)
        left_margins = [m for m in margins if m.type == 'margin_left']
        assert len(left_margins) == 1
        ml = left_margins[0]
        assert ml.x == 0
        assert ml.width == 100

    def test_margin_right_created(self):
        """When there's space after the last column, margin_right should be created."""
        existing = [
            PageRegion(type='column_en', x=50, y=50, width=200, height=300),
            PageRegion(type='column_de', x=260, y=50, width=200, height=300),
        ]
        # last_col_end = 260 + 200 = 460, img_w = 600 → gap = 140
        margins = _build_margin_regions(existing, left_x=50, right_x=460,
                                        img_w=600, top_y=50, content_h=300)
        right_margins = [m for m in margins if m.type == 'margin_right']
        assert len(right_margins) == 1
        mr = right_margins[0]
        assert mr.x == 460
        assert mr.width == 140

    def test_no_margin_when_flush(self):
        """When columns are flush with the image edges, no margins should appear."""
        existing = [
            PageRegion(type='column_en', x=0, y=0, width=300, height=400),
            PageRegion(type='column_de', x=300, y=0, width=300, height=400),
        ]
        margins = _build_margin_regions(existing, left_x=0, right_x=600,
                                        img_w=600, top_y=0, content_h=400)
        assert len(margins) == 0

    def test_margins_in_skip_types(self):
        """Verify margin types are in the skip set used by build_cell_grid."""
        skip = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
        assert 'margin_left' in skip
        assert 'margin_right' in skip

    def test_margin_confidence_and_method(self):
        """Margin regions should have confidence 1.0 and method 'content_bounds'."""
        existing = [PageRegion(type='column_en', x=80, y=20, width=400, height=500)]
        margins = _build_margin_regions(existing, left_x=80, right_x=480,
                                        img_w=600, top_y=20, content_h=500)
        for m in margins:
            assert m.classification_confidence == 1.0
            assert m.classification_method == 'content_bounds'


# =============================================
# RUN TESTS
# =============================================

if __name__ == "__main__":
    pytest.main([__file__, "-v"])