"""Tests for fix_cell_phonetics and _insert_missing_ipa.""" import pytest from unittest.mock import patch, MagicMock import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) class TestInsertMissingIpa: """Tests for _insert_missing_ipa function.""" def test_single_headword_gets_ipa(self): """Single English headword should get IPA inserted.""" from cv_ocr_engines import _insert_missing_ipa result = _insert_missing_ipa("badge", "british") assert "[" in result and "]" in result assert result.startswith("badge [") def test_short_phrase_first_word_gets_ipa(self): """First real word in short phrase gets IPA.""" from cv_ocr_engines import _insert_missing_ipa result = _insert_missing_ipa("film", "british") assert "[" in result def test_long_sentence_unchanged(self): """Sentences with >6 words should not get IPA.""" from cv_ocr_engines import _insert_missing_ipa text = "Can I borrow your CD player from you please" result = _insert_missing_ipa(text, "british") assert result == text def test_existing_brackets_unchanged(self): """Text with existing brackets should not get double IPA.""" from cv_ocr_engines import _insert_missing_ipa text = "dance [dˈɑːns]" result = _insert_missing_ipa(text, "british") assert result == text def test_empty_text_unchanged(self): """Empty text returns empty.""" from cv_ocr_engines import _insert_missing_ipa assert _insert_missing_ipa("", "british") == "" assert _insert_missing_ipa(" ", "british") == "" def test_grammar_words_skipped(self): """Grammar particles should not get IPA.""" from cv_ocr_engines import _insert_missing_ipa # "sth" is in _GRAMMAR_BRACKET_WORDS result = _insert_missing_ipa("sth", "british") assert "[" not in result def test_german_word_no_ipa(self): """German words (no IPA entry) stay unchanged.""" from cv_ocr_engines import _insert_missing_ipa result = _insert_missing_ipa("Anstecknadel", "british") assert result == "Anstecknadel" def test_compound_word_schoolbag_gets_ipa(self): """R07: Compound word 'schoolbag' should get decomposed IPA (school+bag).""" from cv_ocr_engines import _insert_missing_ipa result = _insert_missing_ipa("schoolbag", "british") assert "[" in result and "]" in result assert result.startswith("schoolbag [") def test_compound_word_blackbird(self): """Compound word 'blackbird' should get decomposed IPA.""" from cv_ocr_engines import _insert_missing_ipa result = _insert_missing_ipa("blackbird", "british") assert "[" in result and "]" in result def test_compound_word_too_short(self): """Words shorter than 6 chars should not attempt compound decomposition.""" from cv_ocr_engines import _decompose_compound assert _decompose_compound("bag", "british") is None def test_decompose_compound_direct(self): """Direct test of _decompose_compound for known compounds.""" from cv_ocr_engines import _decompose_compound # schoolbag = school + bag — both should be in dictionary result = _decompose_compound("schoolbag", "british") assert result is not None class TestStripPostBracketGarbled: """Tests for _strip_post_bracket_garbled — trailing garbled IPA removal.""" def test_simple_trailing_garbled(self): """R21-simple: 'sea [sˈiː] si:' → trailing IPA marker removed.""" from cv_ocr_engines import _strip_post_bracket_garbled result = _strip_post_bracket_garbled("sea [sˈiː] si:") assert "si:" not in result assert result.startswith("sea [sˈiː]") def test_multi_word_trailing_garbled(self): """R21: 'seat [sˈiːt] belt si:t belt' → keep 'belt', remove garbled.""" from cv_ocr_engines import _strip_post_bracket_garbled result = _strip_post_bracket_garbled("seat [sˈiːt] belt si:t belt") assert "belt" in result # real word kept assert "si:t" not in result # garbled removed # Should contain "seat [sˈiːt] belt" but not the garbled duplication assert result.count("belt") == 1 def test_delimiter_after_bracket_kept(self): """Delimiters after IPA bracket are kept.""" from cv_ocr_engines import _strip_post_bracket_garbled result = _strip_post_bracket_garbled("dance [dˈɑːns] – tanzen") assert "– tanzen" in result def test_german_after_bracket_kept(self): """German words (uppercase) after IPA bracket are kept.""" from cv_ocr_engines import _strip_post_bracket_garbled result = _strip_post_bracket_garbled("badge [bædʒ] Abzeichen") assert "Abzeichen" in result class TestFixCellPhonetics: """Tests for fix_cell_phonetics function.""" def test_english_column_cells_processed(self): """Cells with col_type column_en should be processed.""" from cv_ocr_engines import fix_cell_phonetics cells = [ {"cell_id": "c1", "col_type": "column_en", "text": "badge"}, {"cell_id": "c2", "col_type": "column_de", "text": "Anstecknadel"}, ] fix_cell_phonetics(cells, pronunciation="british") # English cell should have IPA assert "[" in cells[0]["text"] # German cell should be unchanged assert cells[1]["text"] == "Anstecknadel" def test_column_text_cells_processed(self): """Cells with col_type column_text should be processed.""" from cv_ocr_engines import fix_cell_phonetics cells = [ {"cell_id": "c1", "col_type": "column_text", "text": "challenge"}, ] fix_cell_phonetics(cells, pronunciation="british") assert "[" in cells[0]["text"] def test_garbled_ipa_replaced(self): """Garbled IPA brackets should be replaced with correct IPA.""" from cv_ocr_engines import fix_cell_phonetics cells = [ {"cell_id": "c1", "col_type": "column_en", "text": "dance {'tfatno]"}, ] fix_cell_phonetics(cells, pronunciation="british") # Should have proper IPA now text = cells[0]["text"] assert "dance [" in text assert "{'tfatno]" not in text def test_empty_cells_unchanged(self): """Empty cells should not cause errors.""" from cv_ocr_engines import fix_cell_phonetics cells = [ {"cell_id": "c1", "col_type": "column_en", "text": ""}, {"cell_id": "c2", "col_type": "column_en", "text": None}, ] fix_cell_phonetics(cells, pronunciation="british") assert cells[0]["text"] == "" def test_non_english_col_types_skipped(self): """Cells with column_de, column_example etc. should not be processed.""" from cv_ocr_engines import fix_cell_phonetics cells = [ {"cell_id": "c1", "col_type": "column_de", "text": "Eis (gefrorenes Wasser)"}, {"cell_id": "c2", "col_type": "column_example", "text": "(sich beschweren)"}, ] fix_cell_phonetics(cells, pronunciation="british") assert cells[0]["text"] == "Eis (gefrorenes Wasser)" assert cells[1]["text"] == "(sich beschweren)"