Use spellchecker instead of pyphen for pipe autocorrect validation

pyphen is a pattern-based hyphenator, so it accepts nonsense strings like "Zeplpelin". Switch to spellchecker (a frequency-based word list), which correctly rejects garbled words and can suggest corrections.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 16:47:42 +01:00
parent cc4cb3bc2f
commit 925f4356ce
1 changed files with 64 additions and 40 deletions
@@ -55,6 +55,9 @@ _STOP_WORDS = frozenset([
 _hyph_de = None
 _hyph_en = None
 # Cached spellchecker (for autocorrect_pipe_artifacts)
 _spell_de = None
 def _get_hyphenators():
    """Lazy-load pyphen hyphenators (cached across calls)."""
@@ -70,6 +73,19 @@ def _get_hyphenators():
    return _hyph_de, _hyph_en
 def _get_spellchecker():
    """Return the shared German ``SpellChecker``, creating it on first use.

    The instance is stored in the module-level ``_spell_de`` so later calls
    reuse it.  Returns ``None`` when the ``spellchecker`` package cannot be
    imported; nothing is cached in that case, so the import is attempted
    again on the next call.
    """
    global _spell_de
    if _spell_de is None:
        try:
            from spellchecker import SpellChecker
        except ImportError:
            # Optional dependency is missing: report "unavailable" to callers.
            return None
        _spell_de = SpellChecker(language='de')
    return _spell_de
 def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
    """Check whether pyphen recognises a word (DE or EN)."""
    if len(word) < 2:
@@ -78,6 +94,14 @@ def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
            or '|' in hyph_en.inserted(word, hyphen='|'))
 def _is_real_word(word: str) -> bool:
    """Report whether the lower-cased *word* is contained in the spellchecker.

    Returns ``False`` when no spellchecker could be loaded.
    """
    checker = _get_spellchecker()
    # Short-circuit keeps the membership test from running without a checker.
    return checker is not None and word.lower() in checker
 def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Try to hyphenate a word using DE then EN dictionary.
@@ -92,54 +116,52 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    return None
-def _autocorrect_piped_word(
+def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
    word_with_pipes: str, hyph_de, hyph_en,
 ) -> Optional[str]:
    """Try to correct a word that has OCR pipe artifacts.
    Printed syllable divider lines on dictionary pages confuse OCR:
    the vertical stroke is often read as an extra character (commonly
    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
    Sometimes OCR reads one divider as ``|`` and another as a letter,
    so the garbled character may be far from any detected pipe.
    Uses ``spellchecker`` (frequency-based word list) for validation —
    unlike pyphen which is a pattern-based hyphenator and accepts
    nonsense strings like "Zeplpelin".
    Strategy:
-        1. Strip ``|`` — if pyphen recognises the result, done.
+        1. Strip ``|`` — if spellchecker knows the result, done.
-        2. Record where the pipes were in the stripped string.
+        2. Try deleting each pipe-like character (l, I, 1, i, t).
-        3. Try deleting one character near each pipe position (the extra
+           OCR inserts extra chars that resemble vertical strokes.
-           character the OCR inserted).  If pyphen recognises the
+        3. Fall back to spellchecker's own ``correction()`` method.
-           candidate, return it.
+        4. Preserve the original casing of the first letter.
    """
    stripped = word_with_pipes.replace('|', '')
    if not stripped or len(stripped) < 3:
        return stripped  # too short to validate
-    # Case-preserved check; pyphen is case-insensitive internally
+    # Step 1: if the stripped word is already a real word, done
-    if _is_known_word(stripped, hyph_de, hyph_en):
+    if _is_real_word(stripped):
        return stripped
-    # Map pipe positions into the stripped string.
+    # Step 2: try deleting pipe-like characters (most likely artifacts)
-    # e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
+    _PIPE_LIKE = frozenset('lI1it')
-    # which map to positions 2, 5 in "Zeplpelin".
+    for idx in range(len(stripped)):
-    pipe_positions: List[int] = []
+        if stripped[idx] not in _PIPE_LIKE:
-    offset = 0
+            continue
-    for i, c in enumerate(word_with_pipes):
+        candidate = stripped[:idx] + stripped[idx + 1:]
-        if c == '|':
+        if len(candidate) >= 3 and _is_real_word(candidate):
-            pipe_positions.append(i - offset)
+            return candidate
            offset += 1
-    # Try single-character deletion near each pipe position.
+    # Step 3: use spellchecker's built-in correction
-    # OCR typically inserts ONE extra char per pipe stroke.
+    spell = _get_spellchecker()
-    seen: set = set()
+    if spell is not None:
-    for pos in pipe_positions:
+        suggestion = spell.correction(stripped.lower())
-        for delta in (0, 1, -1, 2, -2):
+        if suggestion and suggestion != stripped.lower():
-            idx = pos + delta
+            # Preserve original first-letter case
-            if idx < 0 or idx >= len(stripped):
+            if stripped[0].isupper():
-                continue
+                suggestion = suggestion[0].upper() + suggestion[1:]
-            candidate = stripped[:idx] + stripped[idx + 1:]
+            return suggestion
            if candidate in seen or len(candidate) < 3:
                continue
            seen.add(candidate)
            if _is_known_word(candidate, hyph_de, hyph_en):
                return candidate
    return None  # could not fix
@@ -154,16 +176,18 @@ def autocorrect_pipe_artifacts(
    This function:
    1. Strips ``|`` from every word in content cells.
-    2. Validates the stripped word with pyphen.
+    2. Validates with spellchecker (real dictionary lookup).
-    3. If not recognised, tries deleting characters that the OCR inserted
+    3. If not recognised, tries deleting pipe-like characters or uses
-       around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
+       spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
    4. Updates both word-box texts and cell text.
    Returns the number of cells modified.
    """
-    hyph_de, hyph_en = _get_hyphenators()
+    spell = _get_spellchecker()
-    if hyph_de is None:
+    if spell is None:
-        return 0
+        logger.warning("spellchecker not available — pipe autocorrect limited")
        # Fall back: still strip pipes even without spellchecker
        pass
    modified = 0
    for z in zones_data:
@@ -193,7 +217,7 @@ def autocorrect_pipe_artifacts(
                if "|" not in core:
                    continue
-                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
+                corrected = _autocorrect_piped_word(core)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True