Use spellchecker instead of pyphen for pipe autocorrect validation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m29s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m29s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s
pyphen is a pattern-based hyphenator that accepts nonsense strings like "Zeplpelin". Switch to spellchecker (frequency-based word list) which correctly rejects garbled words and can suggest corrections. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -55,6 +55,9 @@ _STOP_WORDS = frozenset([
|
||||
_hyph_de = None
|
||||
_hyph_en = None
|
||||
|
||||
# Cached spellchecker (for autocorrect_pipe_artifacts)
|
||||
_spell_de = None
|
||||
|
||||
|
||||
def _get_hyphenators():
|
||||
"""Lazy-load pyphen hyphenators (cached across calls)."""
|
||||
@@ -70,6 +73,19 @@ def _get_hyphenators():
|
||||
return _hyph_de, _hyph_en
|
||||
|
||||
|
||||
def _get_spellchecker():
|
||||
"""Lazy-load German spellchecker (cached across calls)."""
|
||||
global _spell_de
|
||||
if _spell_de is not None:
|
||||
return _spell_de
|
||||
try:
|
||||
from spellchecker import SpellChecker
|
||||
except ImportError:
|
||||
return None
|
||||
_spell_de = SpellChecker(language='de')
|
||||
return _spell_de
|
||||
|
||||
|
||||
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||||
"""Check whether pyphen recognises a word (DE or EN)."""
|
||||
if len(word) < 2:
|
||||
@@ -78,6 +94,14 @@ def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||||
or '|' in hyph_en.inserted(word, hyphen='|'))
|
||||
|
||||
|
||||
def _is_real_word(word: str) -> bool:
|
||||
"""Check whether spellchecker knows this word (case-insensitive)."""
|
||||
spell = _get_spellchecker()
|
||||
if spell is None:
|
||||
return False
|
||||
return word.lower() in spell
|
||||
|
||||
|
||||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||
"""Try to hyphenate a word using DE then EN dictionary.
|
||||
|
||||
@@ -92,54 +116,52 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _autocorrect_piped_word(
|
||||
word_with_pipes: str, hyph_de, hyph_en,
|
||||
) -> Optional[str]:
|
||||
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
|
||||
"""Try to correct a word that has OCR pipe artifacts.
|
||||
|
||||
Printed syllable divider lines on dictionary pages confuse OCR:
|
||||
the vertical stroke is often read as an extra character (commonly
|
||||
``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
|
||||
Sometimes OCR reads one divider as ``|`` and another as a letter,
|
||||
so the garbled character may be far from any detected pipe.
|
||||
|
||||
Uses ``spellchecker`` (frequency-based word list) for validation —
|
||||
unlike pyphen which is a pattern-based hyphenator and accepts
|
||||
nonsense strings like "Zeplpelin".
|
||||
|
||||
Strategy:
|
||||
1. Strip ``|`` — if pyphen recognises the result, done.
|
||||
2. Record where the pipes were in the stripped string.
|
||||
3. Try deleting one character near each pipe position (the extra
|
||||
character the OCR inserted). If pyphen recognises the
|
||||
candidate, return it.
|
||||
1. Strip ``|`` — if spellchecker knows the result, done.
|
||||
2. Try deleting each pipe-like character (l, I, 1, i, t).
|
||||
OCR inserts extra chars that resemble vertical strokes.
|
||||
3. Fall back to spellchecker's own ``correction()`` method.
|
||||
4. Preserve the original casing of the first letter.
|
||||
"""
|
||||
stripped = word_with_pipes.replace('|', '')
|
||||
if not stripped or len(stripped) < 3:
|
||||
return stripped # too short to validate
|
||||
|
||||
# Case-preserved check; pyphen is case-insensitive internally
|
||||
if _is_known_word(stripped, hyph_de, hyph_en):
|
||||
# Step 1: if the stripped word is already a real word, done
|
||||
if _is_real_word(stripped):
|
||||
return stripped
|
||||
|
||||
# Map pipe positions into the stripped string.
|
||||
# e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
|
||||
# which map to positions 2, 5 in "Zeplpelin".
|
||||
pipe_positions: List[int] = []
|
||||
offset = 0
|
||||
for i, c in enumerate(word_with_pipes):
|
||||
if c == '|':
|
||||
pipe_positions.append(i - offset)
|
||||
offset += 1
|
||||
# Step 2: try deleting pipe-like characters (most likely artifacts)
|
||||
_PIPE_LIKE = frozenset('lI1it')
|
||||
for idx in range(len(stripped)):
|
||||
if stripped[idx] not in _PIPE_LIKE:
|
||||
continue
|
||||
candidate = stripped[:idx] + stripped[idx + 1:]
|
||||
if len(candidate) >= 3 and _is_real_word(candidate):
|
||||
return candidate
|
||||
|
||||
# Try single-character deletion near each pipe position.
|
||||
# OCR typically inserts ONE extra char per pipe stroke.
|
||||
seen: set = set()
|
||||
for pos in pipe_positions:
|
||||
for delta in (0, 1, -1, 2, -2):
|
||||
idx = pos + delta
|
||||
if idx < 0 or idx >= len(stripped):
|
||||
continue
|
||||
candidate = stripped[:idx] + stripped[idx + 1:]
|
||||
if candidate in seen or len(candidate) < 3:
|
||||
continue
|
||||
seen.add(candidate)
|
||||
if _is_known_word(candidate, hyph_de, hyph_en):
|
||||
return candidate
|
||||
# Step 3: use spellchecker's built-in correction
|
||||
spell = _get_spellchecker()
|
||||
if spell is not None:
|
||||
suggestion = spell.correction(stripped.lower())
|
||||
if suggestion and suggestion != stripped.lower():
|
||||
# Preserve original first-letter case
|
||||
if stripped[0].isupper():
|
||||
suggestion = suggestion[0].upper() + suggestion[1:]
|
||||
return suggestion
|
||||
|
||||
return None # could not fix
|
||||
|
||||
@@ -154,16 +176,18 @@ def autocorrect_pipe_artifacts(
|
||||
This function:
|
||||
|
||||
1. Strips ``|`` from every word in content cells.
|
||||
2. Validates the stripped word with pyphen.
|
||||
3. If not recognised, tries deleting characters that the OCR inserted
|
||||
around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
|
||||
2. Validates with spellchecker (real dictionary lookup).
|
||||
3. If not recognised, tries deleting pipe-like characters or uses
|
||||
spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
|
||||
4. Updates both word-box texts and cell text.
|
||||
|
||||
Returns the number of cells modified.
|
||||
"""
|
||||
hyph_de, hyph_en = _get_hyphenators()
|
||||
if hyph_de is None:
|
||||
return 0
|
||||
spell = _get_spellchecker()
|
||||
if spell is None:
|
||||
logger.warning("spellchecker not available — pipe autocorrect limited")
|
||||
# Fall back: still strip pipes even without spellchecker
|
||||
pass
|
||||
|
||||
modified = 0
|
||||
for z in zones_data:
|
||||
@@ -193,7 +217,7 @@ def autocorrect_pipe_artifacts(
|
||||
if "|" not in core:
|
||||
continue
|
||||
|
||||
corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
|
||||
corrected = _autocorrect_piped_word(core)
|
||||
if corrected is not None and corrected != core:
|
||||
wb["text"] = lead + corrected + trail
|
||||
cell_changed = True
|
||||
|
||||
Reference in New Issue
Block a user