Files
breakpilot-lehrer/klausur-service/backend/tests/test_word_split.py
Benjamin Admin 9e2c301723
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 38s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Add merged-word splitting to OCR spell review
OCR often merges adjacent words when spacing is tight, e.g.
"atmyschool" → "at my school", "goodidea" → "good idea".

New _try_split_merged_word() uses dynamic programming to find the
shortest sequence of dictionary words covering the token. Integrated
as step 5 in _spell_fix_token() after general spell correction.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 14:11:16 +02:00

86 lines
2.7 KiB
Python

"""Tests for merged-word splitting in cv_review.py.
The OCR sometimes merges adjacent words when character spacing is tight,
e.g. "atmyschool" → "at my school". The _try_split_merged_word() function
uses dynamic programming + dictionary lookup to find valid splits.
"""
import pytest
from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE
# Module-level pytest marker: every test in this file is skipped when the
# spell-checker backend flag imported from cv_review is falsy.
pytestmark = pytest.mark.skipif(
    not _SPELL_AVAILABLE, reason="pyspellchecker not installed"
)
class TestTrySplitMergedWord:
    """Tests for _try_split_merged_word().

    The cases below expect the function to return the split text as a
    whitespace-separated string, or None when no split applies.
    """

    # --- Should split ---

    def test_atmyschool(self):
        """Three merged words are separated into individual words."""
        result = _try_split_merged_word("atmyschool")
        assert result is not None
        words = result.lower().split()
        assert "at" in words
        assert "my" in words
        assert "school" in words

    def test_goodidea(self):
        """Two merged words are separated into individual words."""
        result = _try_split_merged_word("goodidea")
        assert result is not None
        # Compare whole words rather than substrings: a substring check
        # would also accept an unsplit "goodidea".
        words = result.lower().split()
        assert "good" in words
        assert "idea" in words

    def test_comeon(self):
        """A capitalized merged token is split and keeps its casing."""
        result = _try_split_merged_word("Comeon")
        assert result is not None
        assert result.startswith("Come")  # preserves casing
        assert "on" in result.lower().split()

    def test_youknowthe(self):
        """Three short merged words are separated into individual words."""
        result = _try_split_merged_word("youknowthe")
        assert result is not None
        words = result.lower().split()
        assert "you" in words
        assert "know" in words
        assert "the" in words

    # --- Should NOT split ---

    def test_known_word_unchanged(self):
        """A known dictionary word should not be split."""
        assert _try_split_merged_word("school") is None
        assert _try_split_merged_word("beautiful") is None
        assert _try_split_merged_word("together") is None

    def test_short_word(self):
        """Words < 5 chars should not be attempted."""
        assert _try_split_merged_word("the") is None
        assert _try_split_merged_word("at") is None

    def test_nonsense(self):
        """Random letter sequences should not produce a split."""
        result = _try_split_merged_word("xyzqwk")
        assert result is None

    # --- Casing preservation ---

    def test_preserves_capitalization(self):
        """The leading capital letter survives the split."""
        result = _try_split_merged_word("Goodidea")
        assert result is not None
        # Check the first *word*: startswith("Good") alone would also be
        # satisfied by the unsplit input "Goodidea".
        assert result.split()[0] == "Good"

    # --- Edge cases ---

    def test_empty_string(self):
        """An empty token yields no split."""
        assert _try_split_merged_word("") is None

    def test_none_safe(self):
        """Non-alpha input should be handled gracefully.

        Despite the test name, this exercises a digit-only string, not a
        literal None argument.
        """
        # _try_split_merged_word is only called for .isalpha() tokens,
        # but test robustness anyway
        assert _try_split_merged_word("123") is None