breakpilot-lehrer/klausur-service/backend/tests/test_word_split.py

"""Tests for merged-word splitting in cv_review.py.

The OCR sometimes merges adjacent words when character spacing is tight,
e.g. "atmyschool" → "at my school".  The _try_split_merged_word() function
uses dynamic programming + dictionary lookup to find valid splits.
"""

import pytest

from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE

# Module-level pytest mark: skip every test in this file when cv_review
# reports the spell checker as unavailable (_SPELL_AVAILABLE is falsy).
# The tests below rely on dictionary lookups, so without it they would
# fail for reasons unrelated to the splitting logic.
pytestmark = pytest.mark.skipif(
    not _SPELL_AVAILABLE,
    reason="pyspellchecker not installed",
)


class TestTrySplitMergedWord:
    """Tests for _try_split_merged_word().

    These tests expect the function to return a space-separated string when
    a merged token can be split into words, and None when no split applies.
    """

    # --- Should split ---

    def test_atmyschool(self):
        """Three merged lowercase words are separated."""
        result = _try_split_merged_word("atmyschool")
        assert result is not None
        words = result.lower().split()
        assert "at" in words
        assert "my" in words
        assert "school" in words

    def test_goodidea(self):
        """Two merged lowercase words are separated."""
        result = _try_split_merged_word("goodidea")
        assert result is not None
        # Compare whole words: a substring check would also pass for the
        # unsplit input "goodidea" and so would not prove a split happened.
        words = result.lower().split()
        assert "good" in words
        assert "idea" in words

    def test_comeon(self):
        """A capitalized merged token is split and keeps its casing."""
        result = _try_split_merged_word("Comeon")
        assert result is not None
        assert result.startswith("Come")  # preserves casing
        assert "on" in result.lower().split()

    def test_youknowthe(self):
        """Three merged common words are separated."""
        result = _try_split_merged_word("youknowthe")
        assert result is not None
        words = result.lower().split()
        assert "you" in words
        assert "know" in words
        assert "the" in words

    # --- Should NOT split ---

    def test_known_word_unchanged(self):
        """A known dictionary word should not be split."""
        for known in ("school", "beautiful", "together"):
            assert _try_split_merged_word(known) is None

    def test_short_word(self):
        """Words < 5 chars should not be attempted."""
        for short in ("the", "at"):
            assert _try_split_merged_word(short) is None

    def test_nonsense(self):
        """Random letter sequences should not produce a split."""
        result = _try_split_merged_word("xyzqwk")
        assert result is None

    # --- Casing preservation ---

    def test_preserves_capitalization(self):
        """The leading capital of the input survives the split."""
        result = _try_split_merged_word("Goodidea")
        assert result is not None
        # Check the first *word*, not just the prefix: "Goodidea" itself
        # starts with "Good", so startswith() alone cannot detect a
        # missing split.
        assert result.split()[0] == "Good"

    # --- Edge cases ---

    def test_empty_string(self):
        """An empty token yields no split."""
        assert _try_split_merged_word("") is None

    def test_none_safe(self):
        """Non-alpha input should be handled gracefully (no split).

        Despite the test name, this passes a digit string, not None.
        """
        # _try_split_merged_word is only called for .isalpha() tokens,
        # but test robustness anyway
        assert _try_split_merged_word("123") is None