Use spellchecker instead of pyphen for pipe autocorrect validation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m29s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s

pyphen is a pattern-based hyphenator that accepts nonsense strings
like "Zeplpelin". Switch to spellchecker (frequency-based word list)
which correctly rejects garbled words and can suggest corrections.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-27 16:47:42 +01:00
parent cc4cb3bc2f
commit 925f4356ce

View File

@@ -55,6 +55,9 @@ _STOP_WORDS = frozenset([
_hyph_de = None
_hyph_en = None
# Cached spellchecker (for autocorrect_pipe_artifacts)
_spell_de = None
def _get_hyphenators():
"""Lazy-load pyphen hyphenators (cached across calls)."""
@@ -70,6 +73,19 @@ def _get_hyphenators():
return _hyph_de, _hyph_en
def _get_spellchecker():
    """Return the shared German ``SpellChecker``, building it on first use.

    The instance is stored in the module-level ``_spell_de`` so later calls
    reuse it.  Returns ``None`` when the ``spellchecker`` package cannot be
    imported; that outcome is not cached, so the import is attempted again
    on the next call.
    """
    global _spell_de
    if _spell_de is None:
        # Import lazily so the module still loads without the package.
        try:
            from spellchecker import SpellChecker
        except ImportError:
            return None
        _spell_de = SpellChecker(language='de')
    return _spell_de
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
"""Check whether pyphen recognises a word (DE or EN)."""
if len(word) < 2:
@@ -78,6 +94,14 @@ def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
or '|' in hyph_en.inserted(word, hyphen='|'))
def _is_real_word(word: str) -> bool:
    """Report whether the spellchecker's word list contains *word*.

    The word is lower-cased before the lookup.  When no spellchecker is
    available the answer is always ``False``.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
"""Try to hyphenate a word using DE then EN dictionary.
@@ -92,54 +116,52 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
return None
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
    """Try to correct a word that has OCR pipe artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR:
    the vertical stroke is often read as an extra character (commonly
    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
    Sometimes OCR reads one divider as ``|`` and another as a letter,
    so the garbled character may be far from any detected pipe.

    Uses ``spellchecker`` (frequency-based word list) for validation —
    unlike pyphen which is a pattern-based hyphenator and accepts
    nonsense strings like "Zeplpelin".

    Strategy:
      1. Strip ``|`` — if spellchecker knows the result, done.
      2. Try deleting each pipe-like character (l, I, 1, i, t).
         OCR inserts extra chars that resemble vertical strokes.
      3. Fall back to spellchecker's own ``correction()`` method.
      4. Preserve the original casing of the first letter.

    Args:
        word_with_pipes: A single word that may contain ``|`` characters.

    Returns:
        The repaired word, or the pipe-stripped word when it is shorter
        than three characters or when no spellchecker is available (in
        both cases it cannot be validated).  ``None`` when a spellchecker
        is available but no correction was found.
    """
    stripped = word_with_pipes.replace('|', '')
    if len(stripped) < 3:
        return stripped  # too short to validate

    spell = _get_spellchecker()
    if spell is None:
        # Without a word list nothing can be validated.  Removing the pipes
        # is still the safe minimum, and the caller relies on it as its
        # fallback; returning None here would leave the pipes in place.
        return stripped

    # Step 1: if the stripped word is already a real word, done
    if _is_real_word(stripped):
        return stripped

    # Step 2: try deleting pipe-like characters (most likely artifacts)
    pipe_like = frozenset('lI1it')
    for idx, char in enumerate(stripped):
        if char not in pipe_like:
            continue
        candidate = stripped[:idx] + stripped[idx + 1:]
        if len(candidate) >= 3 and _is_real_word(candidate):
            return candidate

    # Step 3: use spellchecker's built-in correction
    lowered = stripped.lower()
    suggestion = spell.correction(lowered)
    if suggestion and suggestion != lowered:
        # Step 4: the lookup was lower-cased, so restore a leading capital
        if stripped[0].isupper():
            suggestion = suggestion[0].upper() + suggestion[1:]
        return suggestion

    return None  # could not fix
@@ -154,16 +176,18 @@ def autocorrect_pipe_artifacts(
This function:
1. Strips ``|`` from every word in content cells.
2. Validates the stripped word with pyphen.
3. If not recognised, tries deleting characters that the OCR inserted
around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
2. Validates with spellchecker (real dictionary lookup).
3. If not recognised, tries deleting pipe-like characters or uses
spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
4. Updates both word-box texts and cell text.
Returns the number of cells modified.
"""
hyph_de, hyph_en = _get_hyphenators()
if hyph_de is None:
return 0
spell = _get_spellchecker()
if spell is None:
logger.warning("spellchecker not available — pipe autocorrect limited")
# Fall back: still strip pipes even without spellchecker
pass
modified = 0
for z in zones_data:
@@ -193,7 +217,7 @@ def autocorrect_pipe_artifacts(
if "|" not in core:
continue
corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
corrected = _autocorrect_piped_word(core)
if corrected is not None and corrected != core:
wb["text"] = lead + corrected + trail
cell_changed = True