Add merged-word splitting to OCR spell review
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 38s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 38s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
OCR often merges adjacent words when spacing is tight, e.g. "atmyschool" → "at my school", "goodidea" → "good idea". New _try_split_merged_word() uses dynamic programming to find the shortest sequence of dictionary words covering the token. Integrated as step 5 in _spell_fix_token() after general spell correction. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -720,6 +720,58 @@ def _spell_dict_knows(word: str) -> bool:
|
||||
return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
|
||||
|
||||
|
||||
def _try_split_merged_word(token: str) -> Optional[str]:
    """Try to split a merged word like 'atmyschool' into 'at my school'.

    Uses dynamic programming to find the shortest sequence of dictionary
    words that covers the entire token. Only returns a result when the
    split produces at least 2 words and ALL parts are known dictionary
    words — a token that is itself a known word yields a 1-word "split"
    and is therefore rejected, leaving known words untouched.

    Preserves original capitalisation by mapping back to the input string.

    Args:
        token: Candidate OCR token; callers pass alphabetic tokens only,
            but non-alphabetic input is handled gracefully (returns None).

    Returns:
        The space-joined split with original casing, or None when no
        valid multi-word split exists (or spellcheck is unavailable).
    """
    if not _SPELL_AVAILABLE or len(token) < 5:
        return None

    lower = token.lower()
    n = len(lower)

    # Bound candidate word length to keep the scan O(n * max_word_len);
    # English/German everyday words beyond 20 chars are vanishingly rare.
    max_word_len = 20

    # dp[i] = shortest list of word lengths covering lower[:i], or None
    # when the prefix cannot be covered by dictionary words at all.
    dp: list = [None] * (n + 1)
    dp[0] = []

    for i in range(1, n + 1):
        for j in range(max(0, i - max_word_len), i):
            if dp[j] is None:
                continue
            candidate = lower[j:i]
            # Single-character words are only plausible for 'a' and 'I'.
            # (Previously duplicated as a dead `word_len < 2` check; the
            # unused `min_len` local has also been removed.)
            if len(candidate) == 1 and candidate not in ('a', 'i'):
                continue
            if _spell_dict_knows(candidate):
                new_split = dp[j] + [i - j]
                # Prefer fewer words (shorter split): favours the most
                # natural segmentation and lets a whole-token dictionary
                # match win, which the < 2 guard below then rejects.
                if dp[i] is None or len(new_split) < len(dp[i]):
                    dp[i] = new_split

    if dp[n] is None or len(dp[n]) < 2:
        return None

    # Reconstruct the split, slicing the ORIGINAL token so casing survives.
    result = []
    pos = 0
    for wlen in dp[n]:
        result.append(token[pos:pos + wlen])
        pos += wlen

    logger.debug("Split merged word: %r → %r", token, " ".join(result))
    return " ".join(result)
||||
def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
|
||||
"""Return corrected form of token, or None if no fix needed/possible.
|
||||
|
||||
@@ -777,6 +829,14 @@ def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
|
||||
correction = correction[0].upper() + correction[1:]
|
||||
if _spell_dict_knows(correction):
|
||||
return correction
|
||||
|
||||
# 5. Merged-word split: OCR often merges adjacent words when spacing
|
||||
# is too tight, e.g. "atmyschool" → "at my school"
|
||||
if len(token) >= 5 and token.isalpha():
|
||||
split = _try_split_merged_word(token)
|
||||
if split:
|
||||
return split
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
85
klausur-service/backend/tests/test_word_split.py
Normal file
85
klausur-service/backend/tests/test_word_split.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Tests for merged-word splitting in cv_review.py.
|
||||
|
||||
The OCR sometimes merges adjacent words when character spacing is tight,
|
||||
e.g. "atmyschool" → "at my school". The _try_split_merged_word() function
|
||||
uses dynamic programming + dictionary lookup to find valid splits.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from cv_review import _try_split_merged_word, _spell_dict_knows, _SPELL_AVAILABLE
|
||||
|
||||
# Every test in this module needs the pyspellchecker dictionaries; skip
# the whole file when the optional dependency is missing.
pytestmark = pytest.mark.skipif(
    not _SPELL_AVAILABLE,
    reason="pyspellchecker not installed",
)
||||
class TestTrySplitMergedWord:
    """Behavioural tests for _try_split_merged_word()."""

    # --- Tokens that must be split ---

    def test_atmyschool(self):
        split = _try_split_merged_word("atmyschool")
        assert split is not None
        parts = split.lower().split()
        for expected in ("at", "my", "school"):
            assert expected in parts

    def test_goodidea(self):
        split = _try_split_merged_word("goodidea")
        assert split is not None
        lowered = split.lower()
        assert "good" in lowered
        assert "idea" in lowered

    def test_comeon(self):
        split = _try_split_merged_word("Comeon")
        assert split is not None
        # The leading capital of the input must survive the split.
        assert split.startswith("Come")
        assert "on" in split.lower().split()

    def test_youknowthe(self):
        split = _try_split_merged_word("youknowthe")
        assert split is not None
        parts = split.lower().split()
        for expected in ("you", "know", "the"):
            assert expected in parts

    # --- Tokens that must NOT be split ---

    def test_known_word_unchanged(self):
        """A known dictionary word should not be split."""
        for word in ("school", "beautiful", "together"):
            assert _try_split_merged_word(word) is None

    def test_short_word(self):
        """Words < 5 chars should not be attempted."""
        for word in ("the", "at"):
            assert _try_split_merged_word(word) is None

    def test_nonsense(self):
        """Random letter sequences should not produce a split."""
        assert _try_split_merged_word("xyzqwk") is None

    # --- Casing preservation ---

    def test_preserves_capitalization(self):
        split = _try_split_merged_word("Goodidea")
        assert split is not None
        assert split.startswith("Good")

    # --- Edge cases ---

    def test_empty_string(self):
        assert _try_split_merged_word("") is None

    def test_none_safe(self):
        """Non-alpha input should be handled gracefully."""
        # _try_split_merged_word is only called for .isalpha() tokens,
        # but test robustness anyway
        assert _try_split_merged_word("123") is None
|
||||
Reference in New Issue
Block a user