# breakpilot-lehrer/klausur-service/backend/tests/test_word_split.py

"""Tests for merged-word splitting in cv_review.py.

The OCR sometimes merges adjacent words when character spacing is tight,
e.g. "atmyschool" → "at my school".  The _try_split_merged_word() function
uses dynamic programming + dictionary lookup to find valid splits.
"""

import pytest

from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE

# Module-level pytest marker: every test in this file is skipped when
# cv_review reports (via _SPELL_AVAILABLE) that the spell checker backing
# the dictionary lookups is not available.
pytestmark = pytest.mark.skipif(
    not _SPELL_AVAILABLE,
    reason="pyspellchecker not installed",
)


class TestTrySplitMergedWord:
    """Tests for _try_split_merged_word().

    As exercised here, the function returns either ``None`` (no split
    proposed) or a string containing the space-separated words.
    """

    # --- Should split ---

    def test_atmyschool(self):
        """Three merged words are separated."""
        result = _try_split_merged_word("atmyschool")
        assert result is not None
        words = result.lower().split()
        assert "at" in words
        assert "my" in words
        assert "school" in words

    def test_goodidea(self):
        """Two merged words are separated."""
        result = _try_split_merged_word("goodidea")
        assert result is not None
        # Compare whole words: a substring check would also match inside
        # a longer, wrongly split token.
        words = result.lower().split()
        assert "good" in words
        assert "idea" in words

    def test_comeon(self):
        """A capitalized merged word is split and keeps its casing."""
        result = _try_split_merged_word("Comeon")
        assert result is not None
        assert result.startswith("Come")  # preserves casing
        assert "on" in result.lower().split()

    def test_youknowthe(self):
        """Three short, common words are separated."""
        result = _try_split_merged_word("youknowthe")
        assert result is not None
        words = result.lower().split()
        assert "you" in words
        assert "know" in words
        assert "the" in words

    def test_imadea(self):
        """A merged token starting with a capital "I" yields a split."""
        result = _try_split_merged_word("Imadea")
        assert result is not None
        # Deliberately lenient: only requires that "made" or the capital
        # "I" survives somewhere in the result.
        assert "made" in result.lower() or "I" in result

    def test_makeadecision(self):
        """A merged phrase containing the one-letter word "a" is split."""
        result = _try_split_merged_word("makeadecision")
        assert result is not None
        words = result.lower().split()
        assert "make" in words
        assert "decision" in words

    # --- Should NOT split ---

    def test_known_word_unchanged(self):
        """A known dictionary word should not be split."""
        assert _try_split_merged_word("school") is None
        assert _try_split_merged_word("beautiful") is None
        assert _try_split_merged_word("together") is None

    def test_short_word(self):
        """Words < 4 chars should not be attempted."""
        assert _try_split_merged_word("the") is None
        assert _try_split_merged_word("at") is None

    def test_nonsense(self):
        """Random letter sequences should not produce a split."""
        result = _try_split_merged_word("xyzqwk")
        assert result is None

    # --- Dictionary-dependent ---

    def test_anew(self):
        """The token "anew" is either left alone or split into "a new"."""
        result = _try_split_merged_word("anew")
        # Whether "anew" counts as a known word depends on the dictionary:
        # if it is known the function returns None, otherwise "a new".
        # Both outcomes are acceptable; anything else is a failure.
        assert result is None or result.lower().split() == ["a", "new"]

    # --- Casing preservation ---

    def test_preserves_capitalization(self):
        """The leading capital of the input is kept in the split result."""
        result = _try_split_merged_word("Goodidea")
        assert result is not None
        assert result.startswith("Good")

    # --- Edge cases ---

    def test_empty_string(self):
        """An empty string yields no split."""
        assert _try_split_merged_word("") is None

    def test_none_safe(self):
        """Non-alpha input should be handled gracefully."""
        # _try_split_merged_word is only called for .isalpha() tokens,
        # but test robustness anyway
        assert _try_split_merged_word("123") is None