feat: IPA-Lautschrift in Cell-Texte einfuegen (fuer Overlay-Modus)

fix_cell_phonetics() ersetzt fehlerhafte IPA-Klammern UND fuegt fehlende Lautschrift fuer englische Woerter ein (z.B. badge, film, challenge, profit). Wird auf alle Zellen mit col_type column_en/column_text angewandt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 15:47:26 +01:00
parent 8a5f2aa188
commit 2f51ac617f
3 changed files with 231 additions and 2 deletions
@@ -0,0 +1,117 @@
+"""Tests for fix_cell_phonetics and _insert_missing_ipa."""
+
+import pytest
+from unittest.mock import patch, MagicMock
+
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+
+class TestInsertMissingIpa:
+    """Unit tests for cv_ocr_engines._insert_missing_ipa(text, pronunciation); each test imports it lazily."""
+
+    def test_single_headword_gets_ipa(self):
+        """Result for the lone headword 'badge' must start with 'badge [' and contain a closing ']'."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("badge", "british")
+        assert "[" in result and "]" in result
+        assert result.startswith("badge [")
+
+    def test_short_phrase_first_word_gets_ipa(self):
+        """Input 'film' must gain a '['. NOTE(review): the name says 'short phrase' but only one word is fed in."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("film", "british")
+        assert "[" in result
+
+    def test_long_sentence_unchanged(self):
+        """A nine-word sentence must be returned verbatim (intended rule: more than 6 words means no IPA)."""
+        from cv_ocr_engines import _insert_missing_ipa
+        text = "Can I borrow your CD player from you please"
+        result = _insert_missing_ipa(text, "british")
+        assert result == text
+
+    def test_existing_brackets_unchanged(self):
+        """Text that already carries a '[...]' group must come back unchanged (no second IPA group)."""
+        from cv_ocr_engines import _insert_missing_ipa
+        text = "dance [dˈɑːns]"
+        result = _insert_missing_ipa(text, "british")
+        assert result == text
+
+    def test_empty_text_unchanged(self):
+        """Empty input must yield '', and whitespace-only input is also expected to yield ''."""
+        from cv_ocr_engines import _insert_missing_ipa
+        assert _insert_missing_ipa("", "british") == ""
+        assert _insert_missing_ipa("  ", "british") == ""
+
+    def test_grammar_words_skipped(self):
+        """The grammar abbreviation 'sth' must not receive any '[' bracket."""
+        from cv_ocr_engines import _insert_missing_ipa
+        # "sth" is expected to be listed in _GRAMMAR_BRACKET_WORDS (defined outside this file — TODO confirm)
+        result = _insert_missing_ipa("sth", "british")
+        assert "[" not in result
+
+    def test_german_word_no_ipa(self):
+        """'Anstecknadel' must be returned unchanged; presumably it has no IPA dictionary entry (verify)."""
+        from cv_ocr_engines import _insert_missing_ipa
+        result = _insert_missing_ipa("Anstecknadel", "british")
+        assert result == "Anstecknadel"
+
+
+class TestFixCellPhonetics:
+    """Unit tests for cv_ocr_engines.fix_cell_phonetics: it is called on a list of cell dicts, then the dicts are inspected."""
+
+    def test_english_column_cells_processed(self):
+        """A column_en cell must gain a '[' while a column_de cell in the same list keeps its text."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_en", "text": "badge"},
+            {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        # column_en cell: its dict was updated in place and the text now holds a bracket
+        assert "[" in cells[0]["text"]
+        # column_de cell: text must be exactly what was passed in
+        assert cells[1]["text"] == "Anstecknadel"
+
+    def test_column_text_cells_processed(self):
+        """A column_text cell ('challenge') must also gain a '['."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_text", "text": "challenge"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        assert "[" in cells[0]["text"]
+
+    def test_garbled_ipa_replaced(self):
+        """A garbled bracket group after 'dance' must give way to 'dance [' with the garbled text removed."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        # The 'dance [' prefix must be present and the garbled group must be gone
+        text = cells[0]["text"]
+        assert "dance [" in text
+        assert "{'tfatno]" not in text
+
+    def test_empty_cells_unchanged(self):
+        """Cells whose text is '' or None must not raise; only the '' cell's text is asserted afterwards."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_en", "text": ""},
+            {"cell_id": "c2", "col_type": "column_en", "text": None},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        assert cells[0]["text"] == ""
+
+    def test_non_english_col_types_skipped(self):
+        """column_de and column_example cells, even with parentheses in the text, must keep their text."""
+        from cv_ocr_engines import fix_cell_phonetics
+        cells = [
+            {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"},
+            {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"},
+        ]
+        fix_cell_phonetics(cells, pronunciation="british")
+        assert cells[0]["text"] == "Eis (gefrorenes Wasser)"
+        assert cells[1]["text"] == "(sich beschweren)"