# breakpilot-lehrer/klausur-service/backend/tests/test_cell_phonetics.py

"""Tests for fix_cell_phonetics and _insert_missing_ipa."""

import pytest
from unittest.mock import patch, MagicMock

import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))


class TestInsertMissingIpa:
    """Behavioural checks for ``_insert_missing_ipa`` and ``_decompose_compound``.

    Both functions are imported lazily from ``ocr.engines.engines`` by the
    private helpers below, so the import only happens when a test runs.
    Every call uses the "british" pronunciation.
    """

    @staticmethod
    def _annotate(text):
        """Return ``_insert_missing_ipa(text, "british")``."""
        from ocr.engines.engines import _insert_missing_ipa
        return _insert_missing_ipa(text, "british")

    @staticmethod
    def _decompose(word):
        """Return ``_decompose_compound(word, "british")``."""
        from ocr.engines.engines import _decompose_compound
        return _decompose_compound(word, "british")

    def test_single_headword_gets_ipa(self):
        """A lone English headword is followed by a bracketed IPA block."""
        annotated = self._annotate("badge")
        assert "[" in annotated
        assert "]" in annotated
        assert annotated.startswith("badge [")

    def test_short_phrase_first_word_gets_ipa(self):
        """The short input 'film' receives an opening IPA bracket.

        NOTE(review): the test name refers to a short phrase, but the input
        is a single word; a multi-word phrase is not exercised here.
        """
        assert "[" in self._annotate("film")

    def test_long_sentence_unchanged(self):
        """A nine-word sentence comes back verbatim (no IPA for >6 words)."""
        sentence = "Can I borrow your CD player from you please"
        assert self._annotate(sentence) == sentence

    def test_existing_brackets_unchanged(self):
        """Text that already carries a bracket is not annotated a second time."""
        already_annotated = "dance [dˈɑːns]"
        assert self._annotate(already_annotated) == already_annotated

    def test_empty_text_unchanged(self):
        """Empty and whitespace-only inputs both yield the empty string."""
        for blank in ("", "  "):
            assert self._annotate(blank) == ""

    def test_grammar_words_skipped(self):
        """Grammar particles are left without an IPA bracket."""
        # "sth" is in _GRAMMAR_BRACKET_WORDS
        assert "[" not in self._annotate("sth")

    def test_german_word_no_ipa(self):
        """A German word (no IPA entry) is returned exactly as given."""
        assert self._annotate("Anstecknadel") == "Anstecknadel"

    def test_compound_word_schoolbag_gets_ipa(self):
        """R07: the compound 'schoolbag' gets IPA (expected via school+bag)."""
        annotated = self._annotate("schoolbag")
        assert "[" in annotated
        assert "]" in annotated
        assert annotated.startswith("schoolbag [")

    def test_compound_word_blackbird(self):
        """The compound 'blackbird' gets a bracketed IPA block."""
        annotated = self._annotate("blackbird")
        assert "[" in annotated
        assert "]" in annotated

    def test_compound_word_too_short(self):
        """A word shorter than 6 chars ('bag') is not decomposed: None."""
        assert self._decompose("bag") is None

    def test_decompose_compound_direct(self):
        """Calling ``_decompose_compound`` directly on a known compound works."""
        # schoolbag = school + bag — both should be in dictionary
        assert self._decompose("schoolbag") is not None


class TestStripPostBracketGarbled:
    """Checks for ``_strip_post_bracket_garbled``.

    The function is expected to drop garbled IPA remnants that trail an IPA
    bracket while keeping real words and delimiters. It is imported lazily
    from ``ocr.engines.engines`` by the helper below.
    """

    @staticmethod
    def _clean(text):
        """Return ``_strip_post_bracket_garbled(text)``."""
        from ocr.engines.engines import _strip_post_bracket_garbled
        return _strip_post_bracket_garbled(text)

    def test_simple_trailing_garbled(self):
        """R21-simple: 'sea [sˈiː] si:' → the trailing 'si:' is removed."""
        cleaned = self._clean("sea [sˈiː] si:")
        assert "si:" not in cleaned
        assert cleaned.startswith("sea [sˈiː]")

    def test_multi_word_trailing_garbled(self):
        """R21: 'seat [sˈiːt] belt si:t belt' → 'belt' kept once, 'si:t' gone."""
        cleaned = self._clean("seat [sˈiːt] belt si:t belt")
        # The genuine word after the bracket must survive ...
        assert "belt" in cleaned
        # ... the garbled IPA remnant must not ...
        assert "si:t" not in cleaned
        # ... and the duplicated tail must not leave a second "belt" behind.
        assert cleaned.count("belt") == 1

    def test_delimiter_after_bracket_kept(self):
        """A dash delimiter and the text after it stay in place."""
        assert "– tanzen" in self._clean("dance [dˈɑːns] – tanzen")

    def test_german_after_bracket_kept(self):
        """A capitalised German word following the bracket is preserved."""
        assert "Abzeichen" in self._clean("badge [bædʒ] Abzeichen")


class TestFixCellPhonetics:
    """Checks for ``fix_cell_phonetics``.

    Each test builds a list of cell dicts (``cell_id``, ``col_type``,
    ``text``), runs ``fix_cell_phonetics`` on it with the "british"
    pronunciation, and inspects the ``text`` values afterwards. The tests
    rely on the function updating the cell dicts in place.
    """

    @staticmethod
    def _fix(cells):
        """Run ``fix_cell_phonetics`` on *cells* and return the same list.

        The import is done lazily so it only happens when a test runs.
        """
        from ocr.engines.engines import fix_cell_phonetics
        fix_cell_phonetics(cells, pronunciation="british")
        return cells

    def test_english_column_cells_processed(self):
        """A ``column_en`` cell gets IPA; a ``column_de`` cell is untouched."""
        cells = self._fix([
            {"cell_id": "c1", "col_type": "column_en", "text": "badge"},
            {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"},
        ])
        # English cell should have IPA
        assert "[" in cells[0]["text"]
        # German cell should be unchanged
        assert cells[1]["text"] == "Anstecknadel"

    def test_column_text_cells_processed(self):
        """A ``column_text`` cell is processed and gets an IPA bracket."""
        cells = self._fix([
            {"cell_id": "c1", "col_type": "column_text", "text": "challenge"},
        ])
        assert "[" in cells[0]["text"]

    def test_garbled_ipa_replaced(self):
        """A garbled IPA bracket is replaced by a proper ``[...]`` block."""
        cells = self._fix([
            {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"},
        ])
        fixed = cells[0]["text"]
        # Should have proper IPA now
        assert "dance [" in fixed
        assert "{'tfatno]" not in fixed

    def test_empty_cells_unchanged(self):
        """Empty-string and ``None`` texts neither raise nor gain content."""
        cells = self._fix([
            {"cell_id": "c1", "col_type": "column_en", "text": ""},
            {"cell_id": "c2", "col_type": "column_en", "text": None},
        ])
        assert cells[0]["text"] == ""
        # The None cell used to be set up but never checked. Accept either
        # None or "" here: this test does not pin whether the function
        # normalises None, only that no text is invented for an empty cell.
        assert not cells[1]["text"]

    def test_non_english_col_types_skipped(self):
        """``column_de`` and ``column_example`` cells are left exactly as-is."""
        cells = self._fix([
            {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"},
            {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"},
        ])
        assert cells[0]["text"] == "Eis (gefrorenes Wasser)"
        assert cells[1]["text"] == "(sich beschweren)"