feat(ocr-review): replace LLM with rule-based spell-checker (REVIEW_ENGINE=spell)

- Add pyspellchecker (MIT) to requirements for EN+DE dictionary lookup
- New spell_review_entries_sync() + spell_review_entries_streaming():
  - Dictionary-backed substitution: checks if corrected word is known
  - Structural rule: digit at pos 0 + lowercase rest → most likely letter (e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld")
  - Pattern rule: "|." → "1." for numbered list prefixes
  - Standalone "|" → "I" (capital I)
  - IPA entries still protected via existing _entry_needs_review filter
  - Headings/untranslated words (e.g. "Story") are untouched (they contain no suspicious characters)
- llm_review_entries + llm_review_entries_streaming: route via REVIEW_ENGINE env var ("spell" default, "llm" to restore previous behaviour)
- docker-compose.yml: REVIEW_ENGINE=${REVIEW_ENGINE:-spell}
- LLM code preserved for fallback (set REVIEW_ENGINE=llm in .env)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 15:04:27 +01:00
parent b1f7fee284
commit 21ea458fcf
3 changed files with 185 additions and 2 deletions
@@ -235,6 +235,7 @@ services:
      OLLAMA_CORRECTION_MODEL: ${OLLAMA_CORRECTION_MODEL:-llama3.2}
      OLLAMA_REVIEW_MODEL: ${OLLAMA_REVIEW_MODEL:-qwen3:0.6b}
      OLLAMA_REVIEW_BATCH_SIZE: ${OLLAMA_REVIEW_BATCH_SIZE:-20}
+      REVIEW_ENGINE: ${REVIEW_ENGINE:-spell}
      OCR_ENGINE: ${OCR_ENGINE:-auto}
      OLLAMA_HTR_MODEL: ${OLLAMA_HTR_MODEL:-qwen2.5vl:32b}
      HTR_FALLBACK_MODEL: ${HTR_FALLBACK_MODEL:-trocr-large}
@@ -5540,11 +5540,183 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
    return changes, entries_out


+# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────
+
+# Review backend selector, read from the environment once, when this module is
+# imported. The review entry points compare it against "spell".
+REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")   # "spell" (default) | "llm"
+
+# Optional dependency: one English and one German checker are built up front.
+# NOTE: when the import fails, _en_spell/_de_spell are never bound, so every
+# user must test _SPELL_AVAILABLE before touching them.
+# NOTE(review): only ImportError is handled; an exception raised by the
+# SpellChecker constructors themselves would propagate out of module import.
+try:
+    from spellchecker import SpellChecker as _SpellChecker
+    _en_spell = _SpellChecker(language='en', distance=1)
+    _de_spell = _SpellChecker(language='de', distance=1)
+    _SPELL_AVAILABLE = True
+    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
+except ImportError:
+    # Flag consulted by _spell_dict_knows and by the review entry points.
+    _SPELL_AVAILABLE = False
+    logger.warning("pyspellchecker not installed — falling back to LLM review")
+
+# Suspicious OCR chars → ordered list of most-likely correct replacements.
+# The list order is significant: _spell_fix_token tries the replacements in
+# this order, and its structural rule takes the first entry of the list.
+_SPELL_SUBS: Dict[str, List[str]] = {
+    '0': ['O', 'o'],
+    '1': ['l', 'I'],
+    '5': ['S', 's'],
+    '6': ['G', 'g'],
+    '8': ['B', 'b'],
+    '|': ['I', 'l', '1'],
+}
+# Immutable key set of _SPELL_SUBS, used for "does this text/token contain a
+# suspicious character" membership tests.
+_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
+
+# Tokenizer: word tokens (letters, digits, pipe) alternating with separators.
+# Digits MUST be part of the token class: the look-alike digits in _SPELL_SUBS
+# ("8en", "5ee", ...) only reach _spell_fix_token when the tokenizer keeps them
+# attached to the letters around them. Purely numeric tokens are filtered out
+# again inside _spell_fix_token.
+_SPELL_TOKEN_RE = _re.compile(r'([0-9A-Za-zÄÖÜäöüß|]+)([^0-9A-Za-zÄÖÜäöüß|]*)')
+
+
+def _spell_dict_knows(word: str) -> bool:
+    """Return True if *word* is known to the EN or DE dictionary.
+
+    The lookup is case-insensitive (the word is lower-cased first). Always
+    returns False when pyspellchecker could not be imported.
+    """
+    if not _SPELL_AVAILABLE:
+        return False
+    w = word.lower()
+    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
+
+
+def _spell_fix_token(token: str) -> Optional[str]:
+    """Return the corrected form of *token*, or None if no fix is needed/possible.
+
+    Rules, in order:
+      1. Tokens without a suspicious character are left alone.
+      2. A standalone "|" becomes "I".
+      3. Tokens without any letter ("1", "2018", "||") are numbers/noise, not
+         misread words, and are left alone.
+      4. Tokens that are already dictionary words are left alone.
+      5. Single-character substitution, accepted only if the dictionary knows
+         the result.
+      6. Structural rule: suspicious char at position 0 followed by lowercase
+         letters only, e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See".
+    """
+    if not any(ch in _SPELL_SUSPICIOUS for ch in token):
+        return None
+    # Standalone pipe → capital I
+    if token == '|':
+        return 'I'
+    # No letters at all: a number or stray pipes, never a misread word.
+    # Without this guard "1" would be rewritten to "l"/"I".
+    if not any(ch.isalpha() for ch in token):
+        return None
+    # Original is already a valid word → leave it
+    if _spell_dict_knows(token):
+        return None
+    # Dictionary-backed single-char substitution
+    for i, ch in enumerate(token):
+        if ch not in _SPELL_SUBS:
+            continue
+        # The dictionary lookup ignores case, so an uppercase replacement
+        # would be accepted mid-word ("k|ng" → "kIng"). After a lowercase
+        # letter, use the lowercase form of the replacement instead.
+        after_lowercase = i > 0 and token[i - 1].islower()
+        for replacement in _SPELL_SUBS[ch]:
+            if after_lowercase:
+                replacement = replacement.lower()
+            candidate = token[:i] + replacement + token[i + 1:]
+            if _spell_dict_knows(candidate):
+                return candidate
+    # Structural rule: suspicious char at position 0 + rest is all lowercase letters
+    # e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld"
+    # NOTE(review): this also fires on tokens such as "5cm" or "8th"; tighten
+    # the rule if such values occur in the reviewed fields.
+    first = token[0]
+    rest = token[1:]
+    if first in _SPELL_SUBS and rest.isalpha() and rest.islower():
+        candidate = _SPELL_SUBS[first][0] + rest
+        if not candidate[0].isdigit():
+            return candidate
+    return None
+
+
+def _spell_fix_field(text: str) -> Tuple[str, bool]:
+    """Apply OCR corrections to a text field.
+
+    Returns ``(fixed_text, was_changed)``. Everything that is not a corrected
+    token (leading punctuation, numbers, separators) is copied through
+    unchanged, so the result differs from *text* only where a rule fired.
+    """
+    if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
+        return text, False
+    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
+    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text)
+
+    def _fix_match(m) -> str:
+        # group(1) is the word token, group(2) the separator run that follows it
+        token, sep = m.group(1), m.group(2)
+        return (_spell_fix_token(token) or token) + sep
+
+    # Pattern.sub() copies unmatched text (e.g. a leading "(") verbatim, so
+    # nothing outside the matched tokens can get lost.
+    fixed = _SPELL_TOKEN_RE.sub(_fix_match, fixed)
+    return fixed, fixed != text
+
+
+def spell_review_entries_sync(entries: List[Dict]) -> Dict:
+    """Run the rule-based OCR correction over a list of vocab entries.
+
+    Each entry is shallow-copied; entries rejected by ``_entry_needs_review``
+    are passed through untouched. For the others, the stripped "english" and
+    "german" fields are sent through ``_spell_fix_field``; every effective
+    change is recorded and the copy is flagged with ``llm_corrected``.
+
+    Returns a dict with the original entries, the corrected copies, the list
+    of changes, ``skipped_count`` (always 0 here), the engine name and the
+    elapsed time in milliseconds.
+    """
+    started = time.time()
+    corrections: List[Dict] = []
+    reviewed: List[Dict] = []
+    for idx, original in enumerate(entries):
+        row = dict(original)
+        # Same object is mutated below, so appending first is equivalent.
+        reviewed.append(row)
+        if not _entry_needs_review(row):
+            continue
+        for key in ("english", "german"):
+            before = (row.get(key) or "").strip()
+            if not before:
+                continue
+            after, touched = _spell_fix_field(before)
+            if not touched or after == before:
+                continue
+            corrections.append({
+                "row_index": row.get("row_index", idx),
+                "field": key,
+                "old": before,
+                "new": after,
+            })
+            row[key] = after
+            # Flag name is shared with the LLM path so consumers need no change.
+            row["llm_corrected"] = True
+    elapsed_ms = int((time.time() - started) * 1000)
+    return {
+        "entries_original": entries,
+        "entries_corrected": reviewed,
+        "changes": corrections,
+        "skipped_count": 0,
+        "model_used": "spell-checker",
+        "duration_ms": elapsed_ms,
+    }
+
+
+async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
+    """Async generator emitting SSE-style events for the spell-checker review.
+
+    Yields exactly three events: "meta", one "batch" covering all entries,
+    and "complete". ``batch_size`` is only echoed in the meta event; the whole
+    list is processed by a single ``spell_review_entries_sync`` call, which
+    runs after the meta event has been yielded.
+    """
+    entry_count = len(entries)
+    yield {
+        "type": "meta",
+        "total_entries": entry_count,
+        "to_review": entry_count,
+        "skipped": 0,
+        "model": "spell-checker",
+        "batch_size": batch_size,
+    }
+
+    outcome = spell_review_entries_sync(entries)
+    found = outcome["changes"]
+    elapsed_ms = outcome["duration_ms"]
+
+    # Row ids fall back to the list position when an entry has no row_index.
+    row_ids = []
+    for position, item in enumerate(entries):
+        row_ids.append(item.get("row_index", position))
+
+    yield {
+        "type": "batch",
+        "batch_index": 0,
+        "entries_reviewed": row_ids,
+        "changes": found,
+        "duration_ms": elapsed_ms,
+        "progress": {"current": entry_count, "total": entry_count},
+    }
+    yield {
+        "type": "complete",
+        "changes": found,
+        "model_used": "spell-checker",
+        "duration_ms": elapsed_ms,
+        "total_entries": entry_count,
+        "reviewed": entry_count,
+        "skipped": 0,
+        "corrections_found": len(found),
+        "entries_corrected": outcome["entries_corrected"],
+    }
+
+# ─── End Spell-Checker ────────────────────────────────────────────────────────
+
+
 async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
 ) -> Dict:
-    """Send vocab entries to a local LLM for OCR error correction (single batch)."""
+    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
+    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
+        return spell_review_entries_sync(entries)
+    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
+        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
+
    model = model or OLLAMA_REVIEW_MODEL

    # Filter: only entries that need review
@@ -5616,7 +5788,14 @@ async def llm_review_entries_streaming(
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
 ):
-    """Async generator: yield SSE events while reviewing entries in batches."""
+    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
+    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
+        async for event in spell_review_entries_streaming(entries, batch_size):
+            yield event
+        return
+    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
+        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
+
    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries
@@ -35,6 +35,9 @@ onnxruntime
 # IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words)
 eng-to-ipa

+# Spell-checker for rule-based OCR correction (MIT license)
+pyspellchecker>=0.8.1
+
 # PostgreSQL (for metrics storage)
 psycopg2-binary>=2.9.0
 asyncpg>=0.29.0