Use spellchecker instead of pyphen for pipe autocorrect validation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m29s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s

pyphen is a pattern-based hyphenator that accepts nonsense strings
like "Zeplpelin". Switch to spellchecker (frequency-based word list)
which correctly rejects garbled words and can suggest corrections.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-27 16:47:42 +01:00
parent cc4cb3bc2f
commit 925f4356ce

View File

@@ -55,6 +55,9 @@ _STOP_WORDS = frozenset([
_hyph_de = None _hyph_de = None
_hyph_en = None _hyph_en = None
# Cached spellchecker (for autocorrect_pipe_artifacts)
_spell_de = None
def _get_hyphenators(): def _get_hyphenators():
"""Lazy-load pyphen hyphenators (cached across calls).""" """Lazy-load pyphen hyphenators (cached across calls)."""
@@ -70,6 +73,19 @@ def _get_hyphenators():
return _hyph_de, _hyph_en return _hyph_de, _hyph_en
# Set once the optional ``spellchecker`` import has failed, so the failing
# import is not retried on every word lookup.
_spell_de_import_failed = False


def _get_spellchecker():
    """Lazy-load the German spellchecker (cached across calls).

    Returns:
        The shared ``SpellChecker(language='de')`` instance, or ``None``
        when the ``spellchecker`` package cannot be imported.
    """
    global _spell_de, _spell_de_import_failed
    if _spell_de is not None:
        return _spell_de
    if _spell_de_import_failed:
        return None
    try:
        from spellchecker import SpellChecker
    except ImportError:
        # Python does not cache failed imports, so without this flag the
        # module search would be repeated on every call.
        _spell_de_import_failed = True
        return None
    _spell_de = SpellChecker(language='de')
    return _spell_de
def _is_known_word(word: str, hyph_de, hyph_en) -> bool: def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
"""Check whether pyphen recognises a word (DE or EN).""" """Check whether pyphen recognises a word (DE or EN)."""
if len(word) < 2: if len(word) < 2:
@@ -78,6 +94,14 @@ def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
or '|' in hyph_en.inserted(word, hyphen='|')) or '|' in hyph_en.inserted(word, hyphen='|'))
def _is_real_word(word: str) -> bool:
    """Return True if the spellchecker's word list contains *word*.

    The lookup is done on the lower-cased word. When no spellchecker is
    available the answer is always False.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]: def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
"""Try to hyphenate a word using DE then EN dictionary. """Try to hyphenate a word using DE then EN dictionary.
@@ -92,54 +116,52 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
return None return None
def _autocorrect_piped_word( def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
word_with_pipes: str, hyph_de, hyph_en,
) -> Optional[str]:
"""Try to correct a word that has OCR pipe artifacts. """Try to correct a word that has OCR pipe artifacts.
Printed syllable divider lines on dictionary pages confuse OCR: Printed syllable divider lines on dictionary pages confuse OCR:
the vertical stroke is often read as an extra character (commonly the vertical stroke is often read as an extra character (commonly
``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears. ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
Sometimes OCR reads one divider as ``|`` and another as a letter,
so the garbled character may be far from any detected pipe.
Uses ``spellchecker`` (frequency-based word list) for validation —
unlike pyphen which is a pattern-based hyphenator and accepts
nonsense strings like "Zeplpelin".
Strategy: Strategy:
1. Strip ``|`` — if pyphen recognises the result, done. 1. Strip ``|`` — if spellchecker knows the result, done.
2. Record where the pipes were in the stripped string. 2. Try deleting each pipe-like character (l, I, 1, i, t).
3. Try deleting one character near each pipe position (the extra OCR inserts extra chars that resemble vertical strokes.
character the OCR inserted). If pyphen recognises the 3. Fall back to spellchecker's own ``correction()`` method.
candidate, return it. 4. Preserve the original casing of the first letter.
""" """
stripped = word_with_pipes.replace('|', '') stripped = word_with_pipes.replace('|', '')
if not stripped or len(stripped) < 3: if not stripped or len(stripped) < 3:
return stripped # too short to validate return stripped # too short to validate
# Case-preserved check; pyphen is case-insensitive internally # Step 1: if the stripped word is already a real word, done
if _is_known_word(stripped, hyph_de, hyph_en): if _is_real_word(stripped):
return stripped return stripped
# Map pipe positions into the stripped string. # Step 2: try deleting pipe-like characters (most likely artifacts)
# e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original, _PIPE_LIKE = frozenset('lI1it')
# which map to positions 2, 5 in "Zeplpelin". for idx in range(len(stripped)):
pipe_positions: List[int] = [] if stripped[idx] not in _PIPE_LIKE:
offset = 0 continue
for i, c in enumerate(word_with_pipes): candidate = stripped[:idx] + stripped[idx + 1:]
if c == '|': if len(candidate) >= 3 and _is_real_word(candidate):
pipe_positions.append(i - offset) return candidate
offset += 1
# Try single-character deletion near each pipe position. # Step 3: use spellchecker's built-in correction
# OCR typically inserts ONE extra char per pipe stroke. spell = _get_spellchecker()
seen: set = set() if spell is not None:
for pos in pipe_positions: suggestion = spell.correction(stripped.lower())
for delta in (0, 1, -1, 2, -2): if suggestion and suggestion != stripped.lower():
idx = pos + delta # Preserve original first-letter case
if idx < 0 or idx >= len(stripped): if stripped[0].isupper():
continue suggestion = suggestion[0].upper() + suggestion[1:]
candidate = stripped[:idx] + stripped[idx + 1:] return suggestion
if candidate in seen or len(candidate) < 3:
continue
seen.add(candidate)
if _is_known_word(candidate, hyph_de, hyph_en):
return candidate
return None # could not fix return None # could not fix
@@ -154,16 +176,18 @@ def autocorrect_pipe_artifacts(
This function: This function:
1. Strips ``|`` from every word in content cells. 1. Strips ``|`` from every word in content cells.
2. Validates the stripped word with pyphen. 2. Validates with spellchecker (real dictionary lookup).
3. If not recognised, tries deleting characters that the OCR inserted 3. If not recognised, tries deleting pipe-like characters or uses
around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``). spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
4. Updates both word-box texts and cell text. 4. Updates both word-box texts and cell text.
Returns the number of cells modified. Returns the number of cells modified.
""" """
hyph_de, hyph_en = _get_hyphenators() spell = _get_spellchecker()
if hyph_de is None: if spell is None:
return 0 logger.warning("spellchecker not available — pipe autocorrect limited")
# Fall back: still strip pipes even without spellchecker
pass
modified = 0 modified = 0
for z in zones_data: for z in zones_data:
@@ -193,7 +217,7 @@ def autocorrect_pipe_artifacts(
if "|" not in core: if "|" not in core:
continue continue
corrected = _autocorrect_piped_word(core, hyph_de, hyph_en) corrected = _autocorrect_piped_word(core)
if corrected is not None and corrected != core: if corrected is not None and corrected != core:
wb["text"] = lead + corrected + trail wb["text"] = lead + corrected + trail
cell_changed = True cell_changed = True