feat: Sprint 1 — IPA hardening, regression framework, ground-truth review

Track A (Backend):
- Compound word IPA decomposition (schoolbag→school+bag)
- Trailing garbled IPA fragment removal after brackets (R21 fix)
- Regression runner with DB persistence, history endpoints
- Page crop determinism verified with tests

Track B (Frontend):
- OCR Regression dashboard (/ai/ocr-regression)
- Ground Truth Review workflow (/ai/ocr-ground-truth) with split-view,
  confidence highlighting, inline edit, batch mark, progress tracking

Track C (Docs):
- OCR-Pipeline.md v5.0 (Steps 5e-5h)
- Regression testing guide
- mkdocs.yml nav update

Track D (Infra):
- TrOCR baseline benchmark script
- run-regression.sh shell script
- Migration 008: regression_runs table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 09:21:27 +01:00
parent f5d5d6c59c
commit a1e079b911
13 changed files with 1796 additions and 15 deletions
@@ -57,6 +57,63 @@ class TestInsertMissingIpa:
        result = _insert_missing_ipa("Anstecknadel", "british")
        assert result == "Anstecknadel"

+    def test_compound_word_schoolbag_gets_ipa(self):
+        """R07: Compound word 'schoolbag' should get decomposed IPA (school+bag)."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("schoolbag", "british")
+        assert "[" in result and "]" in result
+        assert result.startswith("schoolbag [")
+
+    def test_compound_word_blackbird(self):
+        """Compound word 'blackbird' should get decomposed IPA."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("blackbird", "british")
+        assert "[" in result and "]" in result
+
+    def test_compound_word_too_short(self):
+        """Words shorter than 6 chars should not attempt compound decomposition."""
+        from cv_ocr_engines import _decompose_compound
+        assert _decompose_compound("bag", "british") is None
+
+    def test_decompose_compound_direct(self):
+        """Direct test of _decompose_compound for known compounds."""
+        from cv_ocr_engines import _decompose_compound
+        # schoolbag = school + bag — both should be in dictionary
+        result = _decompose_compound("schoolbag", "british")
+        assert result is not None
+
+
+class TestStripPostBracketGarbled:
+    """Tests for _strip_post_bracket_garbled — trailing garbled IPA removal."""
+
+    def test_simple_trailing_garbled(self):
+        """R21-simple: 'sea [sˈiː] si:' → trailing IPA marker removed."""
+        from cv_ocr_engines import _strip_post_bracket_garbled
+        result = _strip_post_bracket_garbled("sea [sˈiː] si:")
+        assert "si:" not in result
+        assert result.startswith("sea [sˈiː]")
+
+    def test_multi_word_trailing_garbled(self):
+        """R21: 'seat [sˈiːt] belt si:t belt' → keep 'belt', remove garbled."""
+        from cv_ocr_engines import _strip_post_bracket_garbled
+        result = _strip_post_bracket_garbled("seat [sˈiːt] belt si:t belt")
+        assert "belt" in result  # real word kept
+        assert "si:t" not in result  # garbled removed
+        # Should contain "seat [sˈiːt] belt" but not the garbled duplication
+        assert result.count("belt") == 1
+
+    def test_delimiter_after_bracket_kept(self):
+        """Delimiters after IPA bracket are kept."""
+        from cv_ocr_engines import _strip_post_bracket_garbled
+        result = _strip_post_bracket_garbled("dance [dˈɑːns] – tanzen")
+        assert "– tanzen" in result
+
+    def test_german_after_bracket_kept(self):
+        """German words (uppercase) after IPA bracket are kept."""
+        from cv_ocr_engines import _strip_post_bracket_garbled
+        result = _strip_post_bracket_garbled("badge [bædʒ] Abzeichen")
+        assert "Abzeichen" in result
+

 class TestFixCellPhonetics:
    """Tests for fix_cell_phonetics function."""