feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 37s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 21s

1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection,
   cell text cleaning, and row merging (116 total, all green)
2. Continuation-row merge: detect multi-line vocab entries where text wraps
   (lowercase EN + empty DE) and merge into previous entry
3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6
4. Batch-OCR: collect empty cells per column, run single Tesseract call on
   column strip instead of per-cell (~66% fewer calls for 3+ empty cells)
5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field
   highlighting, undo/redo (Ctrl+Z), per-cell reset button
6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from
   any step, with reprocess button on completed pipeline steps

Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline
and updates dewarp tests to match current (image, info) return signature.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 14:46:38 +01:00
parent c3a924a620
commit e718353d9f
6 changed files with 775 additions and 79 deletions

View File

@@ -9,6 +9,9 @@ Tests cover:
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
- Stage 6: Multi-pass OCR region handling
- Stage 7: Line grouping and vocabulary matching
- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
- Phonetic detection (_is_phonetic_only_text)
- Phonetic & continuation row merging
- Orchestrator (run_cv_pipeline)
DSGVO Note: All tests run locally with synthetic data. No external API calls.
@@ -36,6 +39,11 @@ from cv_vocab_pipeline import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
CV_PIPELINE_AVAILABLE,
_is_noise_tail_token,
_clean_cell_text,
_is_phonetic_only_text,
_merge_phonetic_continuation_rows,
_merge_continuation_rows,
)
@@ -202,16 +210,28 @@ class TestDeskew:
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
"""Test dewarp (pass-through) stage."""
"""Test dewarp stage (returns (image, info) tuple)."""
def test_dewarp_passthrough(self, white_image):
"""Current dewarp should return the same image (pass-through)."""
def test_dewarp_returns_tuple(self, white_image):
"""dewarp_image must return (image, dewarp_info) tuple."""
result = dewarp_image(white_image)
np.testing.assert_array_equal(result, white_image)
assert isinstance(result, tuple)
assert len(result) == 2
img_out, info = result
assert isinstance(img_out, np.ndarray)
assert isinstance(info, dict)
assert "shear_degrees" in info
def test_dewarp_preserves_shape(self, text_like_image):
result = dewarp_image(text_like_image)
assert result.shape == text_like_image.shape
"""Output image should have same shape as input."""
img_out, _ = dewarp_image(text_like_image)
assert img_out.shape == text_like_image.shape
def test_dewarp_white_image_no_correction(self, white_image):
"""A uniform white image should get no shear correction."""
img_out, info = dewarp_image(white_image)
assert abs(info["shear_degrees"]) < 0.5
assert img_out.shape == white_image.shape
# =============================================
@@ -561,6 +581,268 @@ class TestStageIntegration:
assert layout_img.shape[:2] == corrected.shape[:2]
# =============================================
# NOISE FILTER TESTS
# =============================================
class TestNoiseFilter:
    """_is_noise_tail_token: decide whether a trailing OCR token is noise."""

    # Tokens the filter must preserve: real vocabulary, dictionary-style
    # punctuation, hyphenated compounds, abbreviations, phonetic brackets.
    KEEP_TOKENS = [
        # hyphenated compounds
        "money-saver",
        "under-",
        "well-known",
        # parenthesized dictionary fragments
        "Schild(chen)",
        "(Salat-)Gurke",
        "(auf)",
        "(on)",
        "selbst)",
        "(wir",
        "Tanz(veranstaltung)",
        "(zer)brechen",
        # phonetic transcription brackets
        "serva]",
        "['mani",
        "[eg]",
        "[maus]",
        # real words with trailing punctuation
        "cupcakes.",
        "sister.",
        "mice",
        # common abbreviations
        "e.g.",
        "sth.",
        "usw.",
        "adj.",
        # ellipsis variants
        "...",
        "\u2026",
        # ordinary short words
        "the",
        "cat",
        "big",
        "run",
        "set",
        "ago",
    ]

    # Tokens the filter must drop: pure symbols/digits and tiny fragments.
    NOISE_TOKENS = [
        # non-alphabetic garbage
        "B|",
        "3d",
        "x7",
        ")",
        "|",
        "@",
        "3",
        # very short fragments that are not dictionary words
        "ee",
        "k",
        "zz",
        "qq",
        # empty / whitespace-only
        "",
        " ",
    ]

    @pytest.mark.parametrize("token", KEEP_TOKENS)
    def test_keep_real_tokens(self, token):
        """Real words, dictionary punctuation, and phonetic brackets are kept."""
        verdict = _is_noise_tail_token(token)
        assert verdict is False, f"Should keep {token!r}"

    @pytest.mark.parametrize("token", NOISE_TOKENS)
    def test_filter_noise_tokens(self, token):
        """OCR noise fragments are filtered."""
        verdict = _is_noise_tail_token(token)
        assert verdict is True, f"Should filter {token!r}"
class TestCleanCellText:
    """_clean_cell_text: end-to-end cleaning of raw OCR cell text."""

    def test_empty_returns_empty(self):
        """Empty and whitespace-only input both collapse to ''."""
        assert _clean_cell_text("") == ""
        assert _clean_cell_text(" ") == ""

    def test_real_word_unchanged(self):
        """A single real word passes through untouched."""
        assert _clean_cell_text("cupcakes") == "cupcakes"

    def test_strips_trailing_noise(self):
        """A trailing noise token is stripped from the end."""
        assert _clean_cell_text("cupcakes B|") == "cupcakes"

    def test_keeps_trailing_real_word(self):
        """A trailing real word survives cleaning."""
        assert _clean_cell_text("big cat") == "big cat"

    def test_abbreviation_kept(self):
        """Known abbreviations are never cleared."""
        assert _clean_cell_text("e.g.") == "e.g."

    def test_pure_garbage_cleared(self):
        """Input containing no real word at all is reduced to ''."""
        assert _clean_cell_text("3d |x") == ""

    def test_compound_word_preserved(self):
        """Hyphenated compounds stay intact."""
        assert _clean_cell_text("money-saver") == "money-saver"

    def test_parenthesized_word_preserved(self):
        """Dictionary-style parenthesized entries stay intact."""
        assert _clean_cell_text("(Salat-)Gurke") == "(Salat-)Gurke"

    def test_multiple_trailing_noise(self):
        """Several trailing noise tokens are all removed."""
        assert _clean_cell_text("achieve 3 |") == "achieve"
class TestPhoneticOnlyText:
    """_is_phonetic_only_text: classify text as purely phonetic transcription."""

    # (input text, expected classification) pairs.
    CASES = [
        # bracketed transcriptions with no surrounding word -> phonetic-only
        ("['mani serva]", True),
        ("[dɑːns]", True),
        ("[\"a:mand]", True),
        ("['ːkʃɒp]", True),
        # "serva]" keeps 5 alpha chars once brackets are dropped -> NOT phonetic-only
        ("serva]", False),
        # word plus transcription, or plain words -> NOT phonetic-only
        ("almond ['a:mand]", False),
        ("Mandel", False),
        ("cupcakes", False),
        ("", False),
        ("achieve", False),
        ("money-saver ['mani]", False),
    ]

    @pytest.mark.parametrize("text,expected", CASES)
    def test_phonetic_detection(self, text, expected):
        """Each sample is classified exactly as annotated in CASES."""
        assert _is_phonetic_only_text(text) is expected, \
            f"_is_phonetic_only_text({text!r}) should be {expected}"
class TestMergePhoneticContinuationRows:
    """_merge_phonetic_continuation_rows: fold phonetic-only rows into the row above."""

    def test_empty_list(self):
        """An empty entry list stays empty."""
        assert _merge_phonetic_continuation_rows([]) == []

    def test_single_entry(self):
        """A lone entry is returned unchanged."""
        sole = {"english": "cat", "german": "Katze", "example": ""}
        merged = _merge_phonetic_continuation_rows([sole])
        assert len(merged) == 1
        assert merged[0]["english"] == "cat"

    def test_merges_phonetic_row(self):
        """A phonetic-only row is appended to the previous entry's EN text."""
        base = {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 0}
        phonetic = {"english": "['mani serva]", "german": "", "example": "", "row_index": 1}
        merged = _merge_phonetic_continuation_rows([base, phonetic])
        assert len(merged) == 1
        assert merged[0]["english"] == "money-saver ['mani serva]"
        assert merged[0]["german"] == "Sparfuchs"

    def test_no_merge_when_de_present(self):
        """A row with DE text stays separate even if its EN looks phonetic."""
        rows = [
            {"english": "cat", "german": "Katze", "example": ""},
            {"english": "[kæt]", "german": "some text", "example": ""},
        ]
        assert len(_merge_phonetic_continuation_rows(rows)) == 2

    def test_no_merge_regular_rows(self):
        """Ordinary vocab rows are never merged."""
        rows = [
            {"english": "cat", "german": "Katze", "example": ""},
            {"english": "dog", "german": "Hund", "example": ""},
        ]
        assert len(_merge_phonetic_continuation_rows(rows)) == 2

    def test_merges_example_too(self):
        """Example text on the phonetic row is carried into the merged entry."""
        base = {"english": "dance", "german": "tanzen", "example": "", "row_index": 0}
        phonetic = {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1}
        merged = _merge_phonetic_continuation_rows([base, phonetic])
        assert len(merged) == 1
        assert merged[0]["english"] == "dance [dɑːns]"
        assert merged[0]["example"] == "Let's dance."
class TestMergeContinuationRows:
    """_merge_continuation_rows: join wrapped multi-line vocab entries."""

    def test_empty_list(self):
        """An empty entry list stays empty."""
        assert _merge_continuation_rows([]) == []

    def test_no_merge_independent_rows(self):
        """Rows that each carry both EN and DE are left separate."""
        rows = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "dog", "german": "Hund", "example": "", "row_index": 1},
        ]
        assert len(_merge_continuation_rows(rows)) == 2

    def test_merge_lowercase_continuation(self):
        """Lowercase EN with empty DE is appended to the previous entry."""
        rows = [
            {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0},
            {"english": "with sth.", "german": "", "example": "", "row_index": 1},
        ]
        merged = _merge_continuation_rows(rows)
        assert len(merged) == 1
        assert merged[0]["english"] == "to put up with sth."
        assert merged[0]["german"] == "aufstellen"

    def test_no_merge_uppercase_start(self):
        """Uppercase EN with empty DE reads as its own entry, not a continuation."""
        rows = [
            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
            {"english": "Dog", "german": "", "example": "", "row_index": 1},
        ]
        assert len(_merge_continuation_rows(rows)) == 2

    def test_no_merge_when_previous_ends_with_period(self):
        """A sentence-terminated previous entry blocks merging."""
        rows = [
            {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0},
            {"english": "really nice", "german": "", "example": "", "row_index": 1},
        ]
        assert len(_merge_continuation_rows(rows)) == 2

    def test_no_merge_long_text(self):
        """Four or more words reads as an example sentence, not a continuation."""
        rows = [
            {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0},
            {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1},
        ]
        assert len(_merge_continuation_rows(rows)) == 2

    def test_first_entry_not_merged(self):
        """An empty-DE first entry has no predecessor and must not crash."""
        rows = [
            {"english": "something", "german": "", "example": "", "row_index": 0},
            {"english": "cat", "german": "Katze", "example": "", "row_index": 1},
        ]
        assert len(_merge_continuation_rows(rows)) == 2
# =============================================
# RUN TESTS
# =============================================