Add merged-word splitting to OCR spell review

OCR often merges adjacent words when spacing is tight, e.g. "atmyschool" → "at my school", "goodidea" → "good idea". The new _try_split_merged_word() uses dynamic programming to find the split with the fewest dictionary words that covers the whole token. It is integrated as step 5 in _spell_fix_token(), after general spell correction. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 14:11:16 +02:00
parent 633e301bfd
commit 9e2c301723
2 changed files with 145 additions and 0 deletions
@@ -720,6 +720,58 @@ def _spell_dict_knows(word: str) -> bool:
    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))


+def _try_split_merged_word(token: str) -> Optional[str]:
+    """Try to split a merged word like 'atmyschool' into 'at my school'.
+
+    Uses dynamic programming to find the shortest sequence of dictionary
+    words that covers the entire token.  Only returns a result when the
+    split produces at least 2 words and ALL parts are known dictionary words.
+
+    Preserves original capitalisation by mapping back to the input string.
+    """
+    if not _SPELL_AVAILABLE or len(token) < 5:
+        return None
+
+    lower = token.lower()
+    n = len(lower)
+
+    # dp[i] = shortest list of word lengths that covers lower[:i], or None
+    dp: list = [None] * (n + 1)
+    dp[0] = []
+
+    for i in range(1, n + 1):
+        # Try all possible last-word lengths (2..min(i, 20))
+        # Allow single-char words only for 'a' and 'I'
+        min_len = 1
+        for j in range(max(0, i - 20), i):
+            if dp[j] is None:
+                continue
+            word_len = i - j
+            candidate = lower[j:i]
+            if word_len == 1 and candidate not in ('a', 'i'):
+                continue
+            if word_len < 2 and candidate not in ('a', 'i'):
+                continue
+            if _spell_dict_knows(candidate):
+                new_split = dp[j] + [word_len]
+                # Prefer fewer words (shorter split)
+                if dp[i] is None or len(new_split) < len(dp[i]):
+                    dp[i] = new_split
+
+    if dp[n] is None or len(dp[n]) < 2:
+        return None
+
+    # Reconstruct with original casing
+    result = []
+    pos = 0
+    for wlen in dp[n]:
+        result.append(token[pos:pos + wlen])
+        pos += wlen
+
+    logger.debug("Split merged word: %r → %r", token, " ".join(result))
+    return " ".join(result)
+
+
 def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

@@ -777,6 +829,14 @@ def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction
+
+    # 5. Merged-word split: OCR often merges adjacent words when spacing
+    #    is too tight, e.g. "atmyschool" → "at my school"
+    if len(token) >= 5 and token.isalpha():
+        split = _try_split_merged_word(token)
+        if split:
+            return split
+
    return None


@@ -0,0 +1,85 @@
+"""Tests for merged-word splitting in cv_review.py.
+
+The OCR sometimes merges adjacent words when character spacing is tight,
+e.g. "atmyschool" → "at my school".  The _try_split_merged_word() function
+uses dynamic programming + dictionary lookup to find valid splits.
+"""
+
+import pytest
+
+from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE
+
+# Module-wide marker: skip every test below when cv_review reports that
+# the spell-checker backend is unavailable (_SPELL_AVAILABLE is falsy).
+pytestmark = pytest.mark.skipif(
+    not _SPELL_AVAILABLE,
+    reason="pyspellchecker not installed",
+)
+
+
+class TestTrySplitMergedWord:
+    """Tests for _try_split_merged_word()."""
+
+    # --- Should split ---
+
+    def test_atmyschool(self):
+        result = _try_split_merged_word("atmyschool")
+        assert result is not None
+        words = result.lower().split()
+        assert "at" in words
+        assert "my" in words
+        assert "school" in words
+
+    def test_goodidea(self):
+        result = _try_split_merged_word("goodidea")
+        assert result is not None
+        assert "good" in result.lower()
+        assert "idea" in result.lower()
+
+    def test_comeon(self):
+        result = _try_split_merged_word("Comeon")
+        assert result is not None
+        assert result.startswith("Come")  # preserves casing
+        assert "on" in result.lower().split()
+
+    def test_youknowthe(self):
+        result = _try_split_merged_word("youknowthe")
+        assert result is not None
+        words = result.lower().split()
+        assert "you" in words
+        assert "know" in words
+        assert "the" in words
+
+    # --- Should NOT split ---
+
+    def test_known_word_unchanged(self):
+        """A known dictionary word should not be split."""
+        assert _try_split_merged_word("school") is None
+        assert _try_split_merged_word("beautiful") is None
+        assert _try_split_merged_word("together") is None
+
+    def test_short_word(self):
+        """Words < 5 chars should not be attempted."""
+        assert _try_split_merged_word("the") is None
+        assert _try_split_merged_word("at") is None
+
+    def test_nonsense(self):
+        """Random letter sequences should not produce a split."""
+        result = _try_split_merged_word("xyzqwk")
+        assert result is None
+
+    # --- Casing preservation ---
+
+    def test_preserves_capitalization(self):
+        result = _try_split_merged_word("Goodidea")
+        assert result is not None
+        assert result.startswith("Good")
+
+    # --- Edge cases ---
+
+    def test_empty_string(self):
+        assert _try_split_merged_word("") is None
+
+    def test_none_safe(self):
+        """Non-alpha input should be handled gracefully."""
+        # _try_split_merged_word is only called for .isalpha() tokens,
+        # but test robustness anyway
+        assert _try_split_merged_word("123") is None