def _try_split_merged_word(token: str) -> Optional[str]:
    """Try to split an OCR-merged token like 'atmyschool' into 'at my school'.

    A dynamic programme over prefix lengths looks for a segmentation of the
    lowercased token into the fewest parts that ``_spell_dict_knows`` accepts.
    When several segmentations of a prefix use equally few parts, the one
    with the longer trailing part is kept.

    Args:
        token: Candidate token. Only purely alphabetic tokens of at least
            five characters are attempted.

    Returns:
        The parts joined by single spaces, sliced from the original token so
        its casing is preserved, or ``None`` when the spell checker is
        unavailable, the token is too short or not alphabetic, the token is
        itself a known word, or no segmentation into two or more known
        parts exists.
    """
    min_token_len = 5        # shorter tokens are never attempted
    max_part_len = 20        # longest substring tried as a single word
    one_letter_words = ("a", "i")  # the only accepted single-letter parts

    if not _SPELL_AVAILABLE or len(token) < min_token_len:
        return None
    if not token.isalpha():
        # Digits / punctuation cannot be part of a merged-word split.
        return None

    lower = token.lower()
    n = len(lower)
    if n != len(token):
        # Lowercasing can change the length for some characters (e.g. 'İ');
        # offsets computed on ``lower`` would then mis-slice ``token``.
        return None
    if _spell_dict_knows(lower):
        # A known word is never split, even when it is longer than
        # ``max_part_len`` and the loop below could not cover it in one part.
        return None

    # best[i]: fewest parts covering lower[:i], or None if not coverable.
    # prev[i]: start offset of the last part in that best cover.
    best: list = [None] * (n + 1)
    prev: list = [0] * (n + 1)
    best[0] = 0

    for end in range(1, n + 1):
        for start in range(max(0, end - max_part_len), end):
            if best[start] is None:
                continue
            count = best[start] + 1
            if best[end] is not None and count >= best[end]:
                # Cannot beat the current cover; skip the dictionary lookup.
                continue
            part = lower[start:end]
            if end - start == 1 and part not in one_letter_words:
                continue
            if _spell_dict_knows(part):
                best[end] = count
                prev[end] = start

    if best[n] is None or best[n] < 2:
        return None

    # Walk the back-pointers to recover the cut positions, then slice the
    # original token so the caller gets the input's capitalisation back.
    bounds = [n]
    while bounds[-1] > 0:
        bounds.append(prev[bounds[-1]])
    bounds.reverse()
    split = " ".join(token[a:b] for a, b in zip(bounds, bounds[1:]))

    logger.debug("Split merged word: %r → %r", token, split)
    return split
"""Tests for merged-word splitting in cv_review.py.

OCR output sometimes fuses neighbouring words when the character spacing is
tight, e.g. "atmyschool" → "at my school". These tests exercise
_try_split_merged_word(), which looks for such splits via dictionary lookup.
"""

import pytest

from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE

pytestmark = pytest.mark.skipif(
    not _SPELL_AVAILABLE,
    reason="pyspellchecker not installed",
)


class TestTrySplitMergedWord:
    """Behavioural checks for _try_split_merged_word()."""

    # --- Tokens that are expected to be split ---

    def test_atmyschool(self):
        split = _try_split_merged_word("atmyschool")
        assert split is not None
        parts = split.lower().split()
        for expected in ("at", "my", "school"):
            assert expected in parts

    def test_goodidea(self):
        split = _try_split_merged_word("goodidea")
        assert split is not None
        lowered = split.lower()
        assert "good" in lowered
        assert "idea" in lowered

    def test_comeon(self):
        split = _try_split_merged_word("Comeon")
        assert split is not None
        # The leading capital of the input must survive the split.
        assert split.startswith("Come")
        assert "on" in split.lower().split()

    def test_youknowthe(self):
        split = _try_split_merged_word("youknowthe")
        assert split is not None
        parts = split.lower().split()
        for expected in ("you", "know", "the"):
            assert expected in parts

    # --- Tokens that must be left alone ---

    def test_known_word_unchanged(self):
        """Words already in the dictionary yield no split."""
        for word in ("school", "beautiful", "together"):
            assert _try_split_merged_word(word) is None

    def test_short_word(self):
        """Tokens shorter than five characters are not attempted."""
        for word in ("the", "at"):
            assert _try_split_merged_word(word) is None

    def test_nonsense(self):
        """A random letter sequence yields no split."""
        assert _try_split_merged_word("xyzqwk") is None

    # --- Casing preservation ---

    def test_preserves_capitalization(self):
        split = _try_split_merged_word("Goodidea")
        assert split is not None
        assert split.startswith("Good")

    # --- Edge cases ---

    def test_empty_string(self):
        assert _try_split_merged_word("") is None

    def test_none_safe(self):
        """A short non-alphabetic token yields no split."""
        # Production code only passes .isalpha() tokens to the function;
        # this checks that other input is still handled without error.
        assert _try_split_merged_word("123") is None