"""Tests for merged-word splitting in cv_review.py. The OCR sometimes merges adjacent words when character spacing is tight, e.g. "atmyschool" → "at my school". The _try_split_merged_word() function uses dynamic programming + dictionary lookup to find valid splits. """ import pytest from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE pytestmark = pytest.mark.skipif( not _SPELL_AVAILABLE, reason="pyspellchecker not installed", ) class TestTrySplitMergedWord: """Tests for _try_split_merged_word().""" # --- Should split --- def test_atmyschool(self): result = _try_split_merged_word("atmyschool") assert result is not None words = result.lower().split() assert "at" in words assert "my" in words assert "school" in words def test_goodidea(self): result = _try_split_merged_word("goodidea") assert result is not None assert "good" in result.lower() assert "idea" in result.lower() def test_comeon(self): result = _try_split_merged_word("Comeon") assert result is not None assert result.startswith("Come") # preserves casing assert "on" in result.lower().split() def test_youknowthe(self): result = _try_split_merged_word("youknowthe") assert result is not None words = result.lower().split() assert "you" in words assert "know" in words assert "the" in words # --- Should NOT split --- def test_known_word_unchanged(self): """A known dictionary word should not be split.""" assert _try_split_merged_word("school") is None assert _try_split_merged_word("beautiful") is None assert _try_split_merged_word("together") is None def test_anew(self): result = _try_split_merged_word("anew") # "anew" is itself a known word, so should NOT be split # But "a new" is also valid. Dictionary decides. # If "anew" is known → None. If not → "a new". # Either way, both are acceptable. pass # depends on dictionary def test_imadea(self): result = _try_split_merged_word("Imadea") assert result is not None assert "made" in result.lower() or "I" in result def test_makeadecision(self): result = _try_split_merged_word("makeadecision") assert result is not None assert "make" in result.lower() assert "decision" in result.lower() def test_short_word(self): """Words < 4 chars should not be attempted.""" assert _try_split_merged_word("the") is None assert _try_split_merged_word("at") is None def test_nonsense(self): """Random letter sequences should not produce a split.""" result = _try_split_merged_word("xyzqwk") assert result is None # --- Casing preservation --- def test_preserves_capitalization(self): result = _try_split_merged_word("Goodidea") assert result is not None assert result.startswith("Good") # --- Edge cases --- def test_empty_string(self): assert _try_split_merged_word("") is None def test_none_safe(self): """Non-alpha input should be handled gracefully.""" # _try_split_merged_word is only called for .isalpha() tokens, # but test robustness anyway assert _try_split_merged_word("123") is None