Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 46s
CI / test-python-klausur (push) Failing after 2m48s
CI / test-python-agent-core (push) Successful in 37s
CI / test-nodejs-website (push) Successful in 38s
Short merged words like "anew" (a new), "Imadea" (I made a), and "makeadecision" (make a decision) were missed because the split threshold was too high. The splitter now processes tokens of >= 4 chars. English single-letter words (a, I) are already handled by the DP algorithm, which allows them as valid split points. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
105 lines
3.4 KiB
Python
105 lines
3.4 KiB
Python
"""Tests for merged-word splitting in cv_review.py.
|
|
|
|
The OCR sometimes merges adjacent words when character spacing is tight,
e.g. "atmyschool" → "at my school". The _try_split_merged_word() function
uses dynamic programming + dictionary lookup to find valid splits.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE
|
|
|
|
# Module-wide marker: skip every test here when the spell-check backend
# (pyspellchecker, as reported by cv_review._SPELL_AVAILABLE) is missing.
pytestmark = pytest.mark.skipif(not _SPELL_AVAILABLE, reason="pyspellchecker not installed")
|
|
|
|
|
|
class TestTrySplitMergedWord:
    """Tests for _try_split_merged_word().

    The function under test is expected to return the input re-spaced into
    separate words (e.g. "at my school"), or None when no split applies.
    Split results are checked word-by-word rather than by substring, so an
    unsplit echo of the input cannot satisfy an assertion by accident.
    """

    # --- Should split ---

    def test_atmyschool(self) -> None:
        result = _try_split_merged_word("atmyschool")
        assert result is not None
        words = result.lower().split()
        assert "at" in words
        assert "my" in words
        assert "school" in words

    def test_goodidea(self) -> None:
        result = _try_split_merged_word("goodidea")
        assert result is not None
        # Word-level membership: a plain substring check would also pass
        # for the unsplit input "goodidea".
        words = result.lower().split()
        assert "good" in words
        assert "idea" in words

    def test_comeon(self) -> None:
        result = _try_split_merged_word("Comeon")
        assert result is not None
        assert result.startswith("Come")  # preserves casing
        assert "on" in result.lower().split()

    def test_youknowthe(self) -> None:
        result = _try_split_merged_word("youknowthe")
        assert result is not None
        words = result.lower().split()
        assert "you" in words
        assert "know" in words
        assert "the" in words

    # --- Short merges involving single-letter words (a, I) ---

    def test_anew(self) -> None:
        """"anew" may stay intact or become "a new", depending on the dictionary."""
        result = _try_split_merged_word("anew")
        # If the dictionary knows "anew" the function should leave it alone
        # (None); otherwise the only acceptable split is "a new".
        assert result is None or result.lower().split() == ["a", "new"]

    def test_imadea(self) -> None:
        result = _try_split_merged_word("Imadea")
        assert result is not None
        words = result.split()
        # Compare whole words: the substring test `"I" in result` holds for
        # any output that merely keeps the leading letter, split or not.
        assert "made" in [w.lower() for w in words] or "I" in words

    def test_makeadecision(self) -> None:
        result = _try_split_merged_word("makeadecision")
        assert result is not None
        words = result.lower().split()
        assert "make" in words
        assert "decision" in words

    # --- Should NOT split ---

    def test_known_word_unchanged(self) -> None:
        """A known dictionary word should not be split."""
        for known in ("school", "beautiful", "together"):
            assert _try_split_merged_word(known) is None

    def test_short_word(self) -> None:
        """Words < 4 chars should not be attempted."""
        assert _try_split_merged_word("the") is None
        assert _try_split_merged_word("at") is None

    def test_nonsense(self) -> None:
        """Random letter sequences should not produce a split."""
        assert _try_split_merged_word("xyzqwk") is None

    # --- Casing preservation ---

    def test_preserves_capitalization(self) -> None:
        result = _try_split_merged_word("Goodidea")
        assert result is not None
        # The first *word* must be "Good"; startswith() alone would also
        # accept the unsplit input.
        assert result.split()[0] == "Good"

    # --- Edge cases ---

    def test_empty_string(self) -> None:
        assert _try_split_merged_word("") is None

    def test_none_safe(self) -> None:
        """Non-alpha input should be handled gracefully."""
        # _try_split_merged_word is only called for .isalpha() tokens,
        # but test robustness anyway
        assert _try_split_merged_word("123") is None
|