Use spellchecker instead of pyphen for pipe autocorrect validation

pyphen is a pattern-based hyphenator, so it accepts nonsense strings like "Zeplpelin". Switch to spellchecker (a frequency-based word list), which correctly rejects garbled words and can suggest corrections.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 16:47:42 +01:00
parent cc4cb3bc2f
commit 925f4356ce
1 changed files with 64 additions and 40 deletions
@@ -55,6 +55,9 @@ _STOP_WORDS = frozenset([
 _hyph_de = None
 _hyph_en = None

+# Cached spellchecker (for autocorrect_pipe_artifacts)
+_spell_de = None
+

 def _get_hyphenators():
    """Lazy-load pyphen hyphenators (cached across calls)."""
@@ -70,6 +73,19 @@ def _get_hyphenators():
    return _hyph_de, _hyph_en


+def _get_spellchecker():
+    """Return the shared German ``SpellChecker``, creating it on first use.
+
+    The instance is stored in the module-level ``_spell_de`` so later calls
+    reuse it.  Returns ``None`` when the ``spellchecker`` package cannot be
+    imported; nothing is cached in that case, so the import is attempted
+    again on the next call.
+    """
+    global _spell_de
+    if _spell_de is None:
+        try:
+            from spellchecker import SpellChecker
+        except ImportError:
+            return None
+        _spell_de = SpellChecker(language='de')
+    return _spell_de
+
+
 def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
    """Check whether pyphen recognises a word (DE or EN)."""
    if len(word) < 2:
@@ -78,6 +94,14 @@ def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
            or '|' in hyph_en.inserted(word, hyphen='|'))


+def _is_real_word(word: str) -> bool:
+    """Report whether the German spellchecker contains *word*.
+
+    The lookup is done on the lower-cased word.  Returns ``False`` when no
+    spellchecker is available.
+    """
+    checker = _get_spellchecker()
+    return checker is not None and word.lower() in checker
+
+
 def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Try to hyphenate a word using DE then EN dictionary.

@@ -92,54 +116,52 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    return None


-def _autocorrect_piped_word(
-    word_with_pipes: str, hyph_de, hyph_en,
-) -> Optional[str]:
+def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
     """Try to correct a word that has OCR pipe artifacts.
 
     Printed syllable divider lines on dictionary pages confuse OCR:
     the vertical stroke is often read as an extra character (commonly
     ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
+    Sometimes OCR reads one divider as ``|`` and another as a letter,
+    so the garbled character may be far from any detected pipe.
+
+    Uses ``spellchecker`` (frequency-based word list) for validation —
+    unlike pyphen which is a pattern-based hyphenator and accepts
+    nonsense strings like "Zeplpelin".
 
     Strategy:
-        1. Strip ``|`` — if pyphen recognises the result, done.
-        2. Record where the pipes were in the stripped string.
-        3. Try deleting one character near each pipe position (the extra
-           character the OCR inserted).  If pyphen recognises the
-           candidate, return it.
+        1. Strip ``|`` — if spellchecker knows the result, done.
+        2. Try deleting each pipe-like character (l, I, 1, i, t).
+           OCR inserts extra chars that resemble vertical strokes.
+        3. Fall back to spellchecker's own ``correction()`` method,
+           preserving the original casing of the first letter.
+
+    Returns the corrected word, or ``None`` if no fix was found.  Words
+    shorter than three characters, and every word when spellchecker is
+    unavailable, are returned with the pipes stripped but otherwise
+    unvalidated.
     """
     stripped = word_with_pipes.replace('|', '')
     if not stripped or len(stripped) < 3:
         return stripped  # too short to validate
 
-    # Case-preserved check; pyphen is case-insensitive internally
-    if _is_known_word(stripped, hyph_de, hyph_en):
+    # Without a spellchecker nothing can be validated.  Still return the
+    # pipe-free word so the caller removes the OCR artifact instead of
+    # leaving ``|`` in the text.
+    spell = _get_spellchecker()
+    if spell is None:
+        return stripped
+
+    # Step 1: if the stripped word is already a real word, done
+    if _is_real_word(stripped):
         return stripped
 
-    # Map pipe positions into the stripped string.
-    # e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
-    # which map to positions 2, 5 in "Zeplpelin".
-    pipe_positions: List[int] = []
-    offset = 0
-    for i, c in enumerate(word_with_pipes):
-        if c == '|':
-            pipe_positions.append(i - offset)
-            offset += 1
+    # Step 2: try deleting pipe-like characters (most likely artifacts)
+    _PIPE_LIKE = frozenset('lI1it')
+    for idx, char in enumerate(stripped):
+        if char not in _PIPE_LIKE:
+            continue
+        candidate = stripped[:idx] + stripped[idx + 1:]
+        if len(candidate) >= 3 and _is_real_word(candidate):
+            return candidate
 
-    # Try single-character deletion near each pipe position.
-    # OCR typically inserts ONE extra char per pipe stroke.
-    seen: set = set()
-    for pos in pipe_positions:
-        for delta in (0, 1, -1, 2, -2):
-            idx = pos + delta
-            if idx < 0 or idx >= len(stripped):
-                continue
-            candidate = stripped[:idx] + stripped[idx + 1:]
-            if candidate in seen or len(candidate) < 3:
-                continue
-            seen.add(candidate)
-            if _is_known_word(candidate, hyph_de, hyph_en):
-                return candidate
+    # Step 3: use spellchecker's built-in correction
+    lowered = stripped.lower()
+    suggestion = spell.correction(lowered)
+    if suggestion and suggestion != lowered:
+        # Preserve original first-letter case
+        if stripped[0].isupper():
+            suggestion = suggestion[0].upper() + suggestion[1:]
+        return suggestion
 
     return None  # could not fix

@@ -154,16 +176,18 @@ def autocorrect_pipe_artifacts(
    This function:

    1. Strips ``|`` from every word in content cells.
-    2. Validates the stripped word with pyphen.
-    3. If not recognised, tries deleting characters that the OCR inserted
-       around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
+    2. Validates with spellchecker (real dictionary lookup).
+    3. If not recognised, tries deleting pipe-like characters or uses
+       spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
    4. Updates both word-box texts and cell text.

    Returns the number of cells modified.
    """
-    hyph_de, hyph_en = _get_hyphenators()
-    if hyph_de is None:
-        return 0
+    spell = _get_spellchecker()
+    if spell is None:
+        logger.warning("spellchecker not available — pipe autocorrect limited")
+        # Fall back: still strip pipes even without spellchecker
+        pass

    modified = 0
    for z in zones_data:
@@ -193,7 +217,7 @@ def autocorrect_pipe_artifacts(
                if "|" not in core:
                    continue

-                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
+                corrected = _autocorrect_piped_word(core)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True