Add SmartSpellChecker + refactor vocab-worksheet page.tsx
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s

SmartSpellChecker (klausur-service):
- Language-aware OCR post-correction without LLMs
- Dual-dictionary heuristic for EN/DE language detection
- Context-based a/I disambiguation via bigram lookup
- Multi-digit substitution (sch00l→school)
- Cross-language guard (don't false-correct DE words in EN column)
- Umlaut correction (Schuler→Schüler, uber→über)
- Integrated into spell_review_entries_sync() pipeline
- 31 tests, 9ms/100 corrections

Vocab-worksheet refactoring (studio-v2):
- Split 2337-line page.tsx into 14 files
- Custom hook useVocabWorksheet.ts (all state + logic)
- 9 components in components/ directory
- types.ts, constants.ts for shared definitions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-12 12:25:01 +02:00
parent 04fa01661c
commit 909d0729f6
17 changed files with 3545 additions and 2228 deletions

View File

@@ -881,10 +881,25 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
"""Rule-based OCR correction: spell-checker + structural heuristics.
Deterministic — never translates, never touches IPA, never hallucinates.
Uses SmartSpellChecker for language-aware corrections with context-based
disambiguation (a/I), multi-digit substitution, and cross-language guard.
"""
t0 = time.time()
changes: List[Dict] = []
all_corrected: List[Dict] = []
# Use SmartSpellChecker if available, fall back to legacy _spell_fix_field
_smart = None
try:
from smart_spell import SmartSpellChecker
_smart = SmartSpellChecker()
logger.debug("spell_review: using SmartSpellChecker")
except Exception:
logger.debug("spell_review: SmartSpellChecker not available, using legacy")
# Map field names → language codes for SmartSpellChecker
_LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
for i, entry in enumerate(entries):
e = dict(entry)
# Page-ref normalization (always, regardless of review status)
@@ -907,9 +922,18 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
old_val = (e.get(field_name) or "").strip()
if not old_val:
continue
# example field is mixed-language — try German first (for umlauts)
lang = "german" if field_name in ("german", "example") else "english"
new_val, was_changed = _spell_fix_field(old_val, field=lang)
if _smart:
# SmartSpellChecker path — language-aware, context-based
lang_code = _LANG_MAP.get(field_name, "en")
result = _smart.correct_text(old_val, lang=lang_code)
new_val = result.corrected
was_changed = result.changed
else:
# Legacy path
lang = "german" if field_name in ("german", "example") else "english"
new_val, was_changed = _spell_fix_field(old_val, field=lang)
if was_changed and new_val != old_val:
changes.append({
"row_index": e.get("row_index", i),
@@ -921,12 +945,13 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
e["llm_corrected"] = True
all_corrected.append(e)
duration_ms = int((time.time() - t0) * 1000)
model_name = "smart-spell-checker" if _smart else "spell-checker"
return {
"entries_original": entries,
"entries_corrected": all_corrected,
"changes": changes,
"skipped_count": 0,
"model_used": "spell-checker",
"model_used": model_name,
"duration_ms": duration_ms,
}

View File

@@ -0,0 +1,369 @@
"""
SmartSpellChecker — Language-aware OCR post-correction without LLMs.
Uses pyspellchecker (MIT) with dual EN+DE dictionaries for:
- Automatic language detection per word (dual-dictionary heuristic)
- OCR error correction (digit↔letter, umlauts, transpositions)
- Context-based disambiguation (a/I, l/I) via bigram lookup
- Mixed-language support for example sentences
Lizenz: Apache 2.0 (kommerziell nutzbar)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Init
# ---------------------------------------------------------------------------
try:
from spellchecker import SpellChecker as _SpellChecker
_en_spell = _SpellChecker(language='en', distance=1)
_de_spell = _SpellChecker(language='de', distance=1)
_AVAILABLE = True
except ImportError:
_AVAILABLE = False
logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")
Lang = Literal["en", "de", "both", "unknown"]
# ---------------------------------------------------------------------------
# Bigram context for a/I disambiguation
# ---------------------------------------------------------------------------
# Words that commonly follow "I" (subject pronoun → verb/modal)
_I_FOLLOWERS: frozenset = frozenset({
"am", "was", "have", "had", "do", "did", "will", "would", "can",
"could", "should", "shall", "may", "might", "must",
"think", "know", "see", "want", "need", "like", "love", "hate",
"go", "went", "come", "came", "say", "said", "get", "got",
"make", "made", "take", "took", "give", "gave", "tell", "told",
"feel", "felt", "find", "found", "believe", "hope", "wish",
"remember", "forget", "understand", "mean", "meant",
"don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
"shouldn't", "haven't", "hadn't", "isn't", "wasn't",
"really", "just", "also", "always", "never", "often", "sometimes",
})
# Words that commonly follow "a" (article → noun/adjective)
_A_FOLLOWERS: frozenset = frozenset({
"lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
"long", "short", "big", "small", "large", "huge", "tiny",
"nice", "beautiful", "wonderful", "terrible", "horrible",
"man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
"book", "car", "house", "room", "school", "teacher", "student",
"day", "week", "month", "year", "time", "place", "way",
"friend", "family", "person", "problem", "question", "story",
"very", "really", "quite", "rather", "pretty", "single",
})
# Digit→letter substitutions (OCR confusion)
_DIGIT_SUBS: Dict[str, List[str]] = {
'0': ['o', 'O'],
'1': ['l', 'I'],
'5': ['s', 'S'],
'6': ['g', 'G'],
'8': ['b', 'B'],
'|': ['I', 'l'],
}
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
# Umlaut confusion: OCR drops dots (ü→u, ä→a, ö→o)
_UMLAUT_MAP = {
'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
}
# Tokenizer
_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class CorrectionResult:
    """Outcome of a SmartSpellChecker.correct_text() call."""
    original: str  # input text, unmodified
    corrected: str  # output text (equals original when nothing changed)
    lang_detected: Lang  # language used for correction ("auto" resolves here)
    changed: bool  # convenience flag: corrected != original
    changes: List[str] = field(default_factory=list)  # one log entry per corrected token
# ---------------------------------------------------------------------------
# Core class
# ---------------------------------------------------------------------------
class SmartSpellChecker:
    """Language-aware OCR spell checker using pyspellchecker (no LLM).

    Combines a dual EN/DE dictionary heuristic for language detection,
    digit/pipe OCR substitution, German umlaut restoration, and a
    bigram-based a/I disambiguation. All corrections are deterministic.
    """

    def __init__(self):
        """Bind the module-level EN/DE dictionaries.

        Raises:
            RuntimeError: if pyspellchecker is not installed.
        """
        if not _AVAILABLE:
            raise RuntimeError("pyspellchecker not installed")
        # Dictionaries are loaded once at import time and shared between
        # instances, so constructing a checker is cheap.
        self.en = _en_spell
        self.de = _de_spell

    # --- Language detection ---
    def detect_word_lang(self, word: str) -> Lang:
        """Detect language of a single word using dual-dict heuristic.

        Returns "both" when the word is known to both dictionaries and
        "unknown" when it is known to neither (or is empty after stripping
        surrounding punctuation).
        """
        w = word.lower().strip(".,;:!?\"'()")
        if not w:
            return "unknown"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"

    def detect_text_lang(self, text: str) -> Lang:
        """Detect dominant language of a text string (sentence/phrase).

        Majority vote over per-word detection; words detected as "both"
        vote for neither side. A tie with at least one vote yields "both".
        """
        words = re.findall(r"[A-Za-zÄÖÜäöüß]+", text)
        if not words:
            return "unknown"
        en_count = 0
        de_count = 0
        for w in words:
            lang = self.detect_word_lang(w)
            if lang == "en":
                en_count += 1
            elif lang == "de":
                de_count += 1
            # "both" doesn't count for either
        if en_count > de_count:
            return "en"
        if de_count > en_count:
            return "de"
        if en_count == de_count and en_count > 0:
            return "both"
        return "unknown"

    # --- Single-word correction ---
    def _known(self, word: str) -> bool:
        """True if word is known in EN or DE dictionary."""
        w = word.lower()
        return bool(self.en.known([w])) or bool(self.de.known([w]))

    def _known_in(self, word: str, lang: str) -> bool:
        """True if word is known in a specific language dictionary."""
        w = word.lower()
        spell = self.en if lang == "en" else self.de
        return bool(spell.known([w]))

    def correct_word(self, word: str, lang: str = "en",
                     prev_word: str = "", next_word: str = "") -> Optional[str]:
        """Correct a single word for the given language.

        Returns None if no correction is needed/found, or the corrected
        string. Stages, in order: a/I disambiguation for known ambiguous
        tokens, digit/pipe substitution, umlaut restoration (DE only),
        then a cross-language-guarded edit-distance correction.

        Args:
            word: The word to check/correct
            lang: Expected language ("en" or "de")
            prev_word: Previous word (currently unused; reserved for
                future context rules)
            next_word: Next word (used for a/I disambiguation)
        """
        if not word or not word.strip():
            return None
        # Skip numbers, abbreviations with dots, very short tokens
        if word.isdigit() or '.' in word:
            return None
        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
        # 1. Already known → no fix, except the ambiguous l/| tokens which
        #    may really mean "I" or "a" depending on the following word.
        if self._known(word):
            if word.lower() in ('l', '|') and next_word:
                return self._disambiguate_a_I(word, next_word)
            return None
        # 2. Digit/pipe substitution
        if has_suspicious:
            if word == '|':
                return 'I'
            # Try cheap single-char substitutions first
            for i, ch in enumerate(word):
                if ch not in _DIGIT_SUBS:
                    continue
                for replacement in _DIGIT_SUBS[ch]:
                    candidate = word[:i] + replacement + word[i + 1:]
                    if self._known(candidate):
                        return candidate
            # Try multi-char substitution (e.g., "sch00l" → "school")
            multi = self._try_multi_digit_sub(word)
            if multi:
                return multi
        # 3. Umlaut correction (German)
        if lang == "de" and len(word) >= 3 and word.isalpha():
            umlaut_fix = self._try_umlaut_fix(word)
            if umlaut_fix:
                return umlaut_fix
        # 4. General spell correction
        if not has_suspicious and len(word) >= 3 and word.isalpha():
            # Safety: don't correct if the word is valid in the OTHER language
            # (either directly or via umlaut fix) — it likely landed in the
            # wrong column or is a proper noun.
            other_lang = "de" if lang == "en" else "en"
            if self._known_in(word, other_lang):
                return None
            if other_lang == "de" and self._try_umlaut_fix(word):
                return None  # has a valid DE umlaut variant → don't touch
            spell = self.en if lang == "en" else self.de
            correction = spell.correction(word.lower())
            if correction and correction != word.lower():
                # Preserve the original word's initial capitalization.
                if word[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if self._known(correction):
                    return correction
        return None

    # --- Multi-digit substitution ---
    # NOTE: a broken recursive variant of this method (calling a nonexistent
    # _multi_sub_recurse helper) previously shadowed this definition; it was
    # dead code and has been removed.
    def _try_multi_digit_sub(self, word: str) -> Optional[str]:
        """Try replacing multiple digits simultaneously using BFS.

        Bounded to at most 4 suspicious positions, which caps the search
        at 3^4 = 81 variants (keep-original plus two replacements each).
        """
        positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
        if not positions or len(positions) > 4:
            return None
        # BFS over substitution combinations
        queue = [list(word)]
        for pos, ch in positions:
            next_queue = []
            for current in queue:
                # Keep original
                next_queue.append(current[:])
                # Try each substitution
                for repl in _DIGIT_SUBS[ch]:
                    variant = current[:]
                    variant[pos] = repl
                    next_queue.append(variant)
            queue = next_queue
        # Check which combinations produce known words
        for combo in queue:
            candidate = "".join(combo)
            if candidate != word and self._known(candidate):
                return candidate
        return None

    # --- Umlaut fix ---
    def _try_umlaut_fix(self, word: str) -> Optional[str]:
        """Try single-char umlaut substitutions for German words.

        Returns the first substitution that yields a known word, else None.
        """
        for i, ch in enumerate(word):
            if ch in _UMLAUT_MAP:
                candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
                if self._known(candidate):
                    return candidate
        return None

    # --- a/I disambiguation ---
    def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
        """Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|').

        Uses the bigram follower sets: a following verb/modal suggests the
        pronoun "I", a following noun/adjective suggests the article "a".
        Returns None when the context is inconclusive (caller leaves the
        token unchanged).
        """
        nw = next_word.lower().strip(".,;:!?")
        if nw in _I_FOLLOWERS:
            return "I"
        if nw in _A_FOLLOWERS:
            return "a"
        return None  # uncertain, don't change

    # --- Full text correction ---
    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Tokenizes with _TOKEN_RE into (word, separator) pairs, corrects each
        word with its neighbors as context, and reassembles the text so all
        punctuation and whitespace are preserved byte-for-byte.

        Args:
            text: The text to correct
            lang: Expected language ("en", "de", or "auto" to detect)
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)
        detected = self.detect_text_lang(text) if lang == "auto" else lang
        parts: List[str] = []
        changes: List[str] = []
        tokens = list(_TOKEN_RE.finditer(text))
        for idx, m in enumerate(tokens):
            token, sep = m.group(1), m.group(2)
            next_word = tokens[idx + 1].group(1) if idx + 1 < len(tokens) else ""
            prev_word = tokens[idx - 1].group(1) if idx > 0 else ""
            correction = self.correct_word(
                token, lang=detected if detected in ("en", "de") else "en",
                prev_word=prev_word, next_word=next_word,
            )
            if correction and correction != token:
                # Human-readable audit entry: "old→new"
                changes.append(f"{token}→{correction}")
                parts.append(correction)
            else:
                parts.append(token)
            parts.append(sep)
        # Append any trailing text the tokenizer did not consume
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])
        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---
    def correct_vocab_entry(self, english: str, german: str,
                            example: str = "") -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language — the most reliable
        signal. The optional example sentence is auto-detected since it may
        be written in either language.
        """
        results = {}
        results["english"] = self.correct_text(english, lang="en")
        results["german"] = self.correct_text(german, lang="de")
        if example:
            # For examples, auto-detect language
            results["example"] = self.correct_text(example, lang="auto")
        return results

View File

@@ -0,0 +1,210 @@
"""Tests for SmartSpellChecker — language-aware OCR post-correction."""
import pytest
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from smart_spell import SmartSpellChecker, CorrectionResult
@pytest.fixture
def sc():
    # Fresh checker per test; the underlying dictionaries are module-level
    # singletons in smart_spell, so construction is cheap.
    return SmartSpellChecker()
# ─── Language Detection ──────────────────────────────────────────────────────
class TestLanguageDetection:
    """Dual-dictionary language detection on single words and short texts."""

    def test_clear_english_words(self, sc):
        samples = ("school", "beautiful", "homework", "yesterday", "because")
        detected = {w: sc.detect_word_lang(w) for w in samples}
        for word, lang in detected.items():
            assert lang in ("en", "both"), f"{word} should be EN"

    def test_clear_german_words(self, sc):
        samples = ("Schule", "Hausaufgaben", "Freundschaft", "Straße", "Entschuldigung")
        detected = {w: sc.detect_word_lang(w) for w in samples}
        for word, lang in detected.items():
            assert lang in ("de", "both"), f"{word} should be DE"

    def test_ambiguous_words(self, sc):
        """Words that exist in both languages."""
        for word in ["Hand", "Finger", "Arm", "Name", "Ball"]:
            assert sc.detect_word_lang(word) == "both", f"{word} should be 'both'"

    def test_unknown_words(self, sc):
        # Nonsense and empty input both map to "unknown".
        assert sc.detect_word_lang("xyzqwk") == "unknown"
        assert sc.detect_word_lang("") == "unknown"

    def test_english_sentence(self, sc):
        assert sc.detect_text_lang("I go to school every day") == "en"

    def test_german_sentence(self, sc):
        assert sc.detect_text_lang("Ich gehe jeden Tag zur Schule") == "de"

    def test_mixed_sentence(self, sc):
        # The dominant language should win the majority vote.
        outcome = sc.detect_text_lang("I like to play Fußball with my Freunde")
        assert outcome in ("en", "both")
# ─── Single Word Correction ────────────────────────────────────────────────
class TestSingleWordCorrection:
    """correct_word() on isolated tokens: OCR digits, umlauts, typos."""

    def test_known_word_not_changed(self, sc):
        # Valid dictionary words pass through untouched (None = no fix).
        for token, language in (("school", "en"), ("Freund", "de")):
            assert sc.correct_word(token, language) is None

    def test_digit_letter_single(self, sc):
        fixes = {"g0od": "good", "he1lo": "hello"}
        for raw, fixed in fixes.items():
            assert sc.correct_word(raw, "en") == fixed

    def test_digit_letter_multi(self, sc):
        """Multiple digit substitutions (e.g., sch00l)."""
        result = sc.correct_word("sch00l", "en")
        assert result == "school", f"Expected 'school', got '{result}'"

    def test_pipe_to_I(self, sc):
        assert "I" == sc.correct_word("|", "en")

    def test_umlaut_schuler(self, sc):
        assert "Schüler" == sc.correct_word("Schuler", "de")

    def test_umlaut_uber(self, sc):
        assert "über" == sc.correct_word("uber", "de")

    def test_umlaut_bucher(self, sc):
        assert "Bücher" == sc.correct_word("Bucher", "de")

    def test_umlaut_turkei(self, sc):
        assert "Türkei" == sc.correct_word("Turkei", "de")

    def test_missing_char(self, sc):
        assert "beautiful" == sc.correct_word("beautful", "en")

    def test_transposition(self, sc):
        assert "the" == sc.correct_word("teh", "en")

    def test_swap(self, sc):
        assert "friend" == sc.correct_word("freind", "en")

    def test_no_false_correction_cross_lang(self, sc):
        """Don't correct a word that's valid in the other language.

        'Schuler' in the EN column should NOT be corrected to 'Schuyler'
        because 'Schüler' is valid German — it's likely a German word that
        ended up in the wrong column (or is a surname).
        """
        outcome = sc.correct_word("Schuler", "en")
        # Either None (left alone) or at least not the bogus EN correction.
        assert outcome != "Schuyler", "Should not false-correct German word in EN column"
# ─── a/I Disambiguation ──────────────────────────────────────────────────────
class TestAIDisambiguation:
    """Bigram-context resolution of ambiguous 'l'/'a' OCR tokens."""

    def test_I_before_verb(self, sc):
        # A following verb/modal signals the pronoun "I".
        for follower in ("am", "was", "think", "have", "don't"):
            assert sc._disambiguate_a_I("l", follower) == "I"

    def test_a_before_noun_adj(self, sc):
        # A following noun/adjective signals the article "a".
        for follower in ("book", "cat", "big", "lot"):
            assert sc._disambiguate_a_I("a", follower) == "a"

    def test_uncertain_returns_none(self, sc):
        """When context is ambiguous, return None (don't change)."""
        assert sc._disambiguate_a_I("l", "xyzqwk") is None
# ─── Full Text Correction ───────────────────────────────────────────────────
class TestFullTextCorrection:
    """correct_text() on whole field values."""

    def test_english_sentence(self, sc):
        outcome = sc.correct_text("teh cat is beautful", "en")
        assert outcome.changed
        for fixed in ("the", "beautiful"):
            assert fixed in outcome.corrected

    def test_german_sentence_no_change(self, sc):
        outcome = sc.correct_text("Ich gehe zur Schule", "de")
        assert not outcome.changed

    def test_german_umlaut_fix(self, sc):
        outcome = sc.correct_text("Der Schuler liest Bucher", "de")
        for fixed in ("Schüler", "Bücher"):
            assert fixed in outcome.corrected

    def test_preserves_punctuation(self, sc):
        outcome = sc.correct_text("teh cat, beautful!", "en")
        for mark in (",", "!"):
            assert mark in outcome.corrected

    def test_empty_text(self, sc):
        outcome = sc.correct_text("", "en")
        assert not outcome.changed
        assert outcome.corrected == ""
# ─── Vocab Entry Correction ─────────────────────────────────────────────────
class TestVocabEntryCorrection:
    """correct_vocab_entry(): column position determines the language."""

    def test_basic_entry(self, sc):
        outcome = sc.correct_vocab_entry(english="beautful", german="schön")
        assert outcome["english"].corrected == "beautiful"
        assert outcome["german"].changed is False

    def test_umlaut_in_german(self, sc):
        outcome = sc.correct_vocab_entry(english="school", german="Schuler")
        assert outcome["english"].changed is False
        assert outcome["german"].corrected == "Schüler"

    def test_example_auto_detect(self, sc):
        outcome = sc.correct_vocab_entry(
            english="friend",
            german="Freund",
            example="My best freind lives in Berlin",
        )
        # The example column has no fixed language — auto-detection applies.
        assert "friend" in outcome["example"].corrected
# ─── Speed ─────────────────────────────────────────────────────────────────
class TestSpeed:
    """Coarse performance guard — not a benchmark."""

    def test_100_corrections_under_500ms(self, sc):
        """100 word corrections should complete in under 500ms."""
        import time
        base_cases = [
            ("beautful", "en"), ("teh", "en"), ("freind", "en"),
            ("homwork", "en"), ("yesturday", "en"),
            ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"),
            ("uber", "de"), ("Ubung", "de"),
        ]
        workload = base_cases * 10
        start = time.time()
        for token, language in workload:
            sc.correct_word(token, language)
        dt = time.time() - start
        print(f"\n 100 corrections in {dt*1000:.0f}ms")
        assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms"

View File

@@ -0,0 +1,494 @@
"""
Benchmark: Spell-checking & language detection approaches for OCR post-correction.
Tests pyspellchecker (already used), symspellpy (candidate), and
dual-dictionary language detection heuristic on real vocabulary OCR data.
Run: pytest tests/test_spell_benchmark.py -v -s
"""
import time
import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _load_pyspellchecker():
    """Return (EN, DE) pyspellchecker instances with edit distance 1."""
    from spellchecker import SpellChecker

    english = SpellChecker(language='en', distance=1)
    german = SpellChecker(language='de', distance=1)
    return english, german
def _load_symspellpy():
    """Load symspellpy with English frequency dict (bundled).

    Returns (SymSpell instance, Verbosity enum). May raise ImportError or
    FileNotFoundError; callers skip their tests in that case.
    """
    from symspellpy import SymSpell, Verbosity
    sym = SymSpell(max_dictionary_edit_distance=2)
    # Use bundled English frequency dict
    # NOTE(review): pkg_resources is deprecated in modern setuptools;
    # consider importlib.resources.files("symspellpy") — verify first.
    import pkg_resources
    dict_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
    sym.load_dictionary(dict_path, term_index=0, count_index=1)
    return sym, Verbosity
# ---------------------------------------------------------------------------
# Test data: (ocr_output, expected_correction, language, category)
# ---------------------------------------------------------------------------
OCR_TEST_CASES = [
    # Each row: (ocr_output, expected_correction, language, category).
    # Rows whose expected value equals the input are "must not change" cases.
    # --- Single-char ambiguity ---
    ("l am a student", "I am a student", "en", "a_vs_I"),
    ("a book", "a book", "en", "a_vs_I"),  # should NOT change
    ("I like cats", "I like cats", "en", "a_vs_I"),  # should NOT change
    ("lt is raining", "It is raining", "en", "a_vs_I"),  # l→I at start
    # --- Digit-letter confusion ---
    ("g0od", "good", "en", "digit_letter"),
    ("sch00l", "school", "en", "digit_letter"),
    ("he1lo", "hello", "en", "digit_letter"),
    ("Sch0n", "Schon", "de", "digit_letter"),  # German
    # --- Umlaut drops ---
    ("schon", "schön", "de", "umlaut"),  # context: "schon" is also valid DE!
    ("Schuler", "Schüler", "de", "umlaut"),
    ("uber", "über", "de", "umlaut"),
    ("Bucher", "Bücher", "de", "umlaut"),
    ("Turkei", "Türkei", "de", "umlaut"),
    # --- Common OCR errors ---
    ("beautful", "beautiful", "en", "missing_char"),
    ("teh", "the", "en", "transposition"),
    ("becasue", "because", "en", "transposition"),
    ("freind", "friend", "en", "swap"),
    ("Freund", "Freund", "de", "correct"),  # already correct
    # --- Merged words ---
    ("atmyschool", "at my school", "en", "merged"),
    ("goodidea", "good idea", "en", "merged"),
    # --- Mixed language example sentences ---
    ("I go to teh school", "I go to the school", "en", "sentence"),
    ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"),
]
# Language detection test: (word, expected_language)
LANG_DETECT_CASES = [
    # Clear English
    ("school", "en"),
    ("beautiful", "en"),
    ("homework", "en"),
    ("yesterday", "en"),
    ("children", "en"),
    ("because", "en"),
    ("environment", "en"),
    ("although", "en"),
    # Clear German
    ("Schule", "de"),
    ("Hausaufgaben", "de"),
    ("Freundschaft", "de"),
    ("Umwelt", "de"),
    ("Kindergarten", "de"),  # also used in English!
    ("Bücher", "de"),
    ("Straße", "de"),
    ("Entschuldigung", "de"),
    # Ambiguous (exist in both)
    ("Hand", "both"),
    ("Finger", "both"),
    ("Arm", "both"),
    ("Name", "both"),
    ("Ball", "both"),
    # Short/tricky
    ("a", "en"),
    ("I", "en"),
    ("in", "both"),
    ("an", "both"),
    ("the", "en"),
    ("die", "de"),
    ("der", "de"),
    ("to", "en"),
    ("zu", "de"),
]
# ===========================================================================
# Tests
# ===========================================================================
class TestPyspellchecker:
    """Test pyspellchecker capabilities for OCR correction.

    Mostly reporting-style benchmarks: they print OK/MISS per case rather
    than hard-asserting, so a regression shows up in the -s output.
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        # autouse: every test in this class gets fresh EN/DE checkers.
        self.en, self.de = _load_pyspellchecker()
    def test_known_words(self):
        """Verify basic dictionary lookup."""
        assert self.en.known(["school"])
        assert self.en.known(["beautiful"])
        # pyspellchecker dictionaries are lowercase-keyed.
        assert self.de.known(["schule"])  # lowercase
        assert self.de.known(["freund"])
        # Not known
        assert not self.en.known(["xyzqwk"])
        assert not self.de.known(["xyzqwk"])
    def test_correction_quality(self):
        """Test correction suggestions for OCR errors.

        Reporting-only: tallies hits over the single-word OCR_TEST_CASES.
        """
        results = []
        for ocr, expected, lang, category in OCR_TEST_CASES:
            if category in ("sentence", "sentence_correct", "merged", "a_vs_I"):
                continue  # skip multi-word cases
            spell = self.en if lang == "en" else self.de
            words = ocr.split()
            corrected = []
            for w in words:
                if spell.known([w.lower()]):
                    corrected.append(w)
                else:
                    fix = spell.correction(w.lower())
                    if fix and fix != w.lower():
                        # Preserve case
                        if w[0].isupper():
                            fix = fix[0].upper() + fix[1:]
                        corrected.append(fix)
                    else:
                        corrected.append(w)
            result = " ".join(corrected)
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            # NOTE(review): a separator (likely "→") appears to have been
            # lost between the quoted values in these prints — confirm
            # against the original source.
            if not ok:
                print(f" MISS: '{ocr}''{result}' (expected '{expected}') [{category}]")
            else:
                print(f" OK: '{ocr}''{result}' [{category}]")
        # Rows are 5-tuples; the pattern `*_, ok, _` binds the 4th field (ok).
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)")
    def test_language_detection_heuristic(self):
        """Test dual-dictionary language detection."""
        results = []
        for word, expected_lang in LANG_DETECT_CASES:
            w = word.lower()
            # Same dual-dict heuristic as smart_spell.detect_word_lang.
            in_en = bool(self.en.known([w]))
            in_de = bool(self.de.known([w]))
            if in_en and in_de:
                detected = "both"
            elif in_en:
                detected = "en"
            elif in_de:
                detected = "de"
            else:
                detected = "unknown"
            ok = detected == expected_lang
            results.append((word, expected_lang, detected, ok))
            if not ok:
                print(f" MISS: '{word}'{detected} (expected {expected_lang})")
            else:
                print(f" OK: '{word}'{detected}")
        correct = sum(1 for *_, ok in results if ok)
        total = len(results)
        print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)")
    def test_umlaut_awareness(self):
        """Test if pyspellchecker suggests umlaut corrections.

        Reporting-only: inspects candidate sets, no assertions.
        """
        # "Schuler" should suggest "Schüler"
        candidates = self.de.candidates("schuler")
        print(f" 'schuler' candidates: {candidates}")
        # "uber" should suggest "über"
        candidates_uber = self.de.candidates("uber")
        print(f" 'uber' candidates: {candidates_uber}")
        # "Turkei" should suggest "Türkei"
        candidates_turkei = self.de.candidates("turkei")
        print(f" 'turkei' candidates: {candidates_turkei}")
    def test_speed_100_words(self):
        """Measure correction speed for 100 words."""
        words_en = ["beautful", "teh", "becasue", "freind", "shcool",
                    "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words_en:
            self.en.correction(w)
        dt = time.time() - t0
        print(f"\n pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms")
        words_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung",
                    "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10
        t0 = time.time()
        for w in words_de:
            self.de.correction(w)
        dt = time.time() - t0
        print(f" pyspellchecker: 100 DE corrections in {dt*1000:.0f}ms")
class TestSymspellpy:
    """Test symspellpy as a faster alternative.

    All tests skip when symspellpy (or its bundled dictionary) is missing.
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        try:
            self.sym, self.Verbosity = _load_symspellpy()
            self.available = True
        except (ImportError, FileNotFoundError) as e:
            self.available = False
            pytest.skip(f"symspellpy not installed: {e}")
    def test_correction_quality(self):
        """Test symspellpy corrections (EN only — no DE dict bundled)."""
        # NOTE(review): `_` here is the bound language field of each 4-tuple
        # and is used in the filter condition; it works, but renaming it to
        # `lang` would be clearer.
        en_cases = [(o, e, c) for o, e, _, c in OCR_TEST_CASES
                    if _ == "en" and c not in ("sentence", "sentence_correct", "merged", "a_vs_I")]
        results = []
        for ocr, expected, category in en_cases:
            suggestions = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2)
            if suggestions:
                fix = suggestions[0].term
                # Preserve the original initial capitalization.
                if ocr[0].isupper():
                    fix = fix[0].upper() + fix[1:]
                result = fix
            else:
                result = ocr
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            status = "OK" if ok else "MISS"
            # NOTE(review): a separator (likely "→") appears lost between
            # the quoted values — confirm against the original source.
            print(f" {status}: '{ocr}''{result}' (expected '{expected}') [{category}]")
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)")
    def test_speed_100_words(self):
        """Measure symspellpy correction speed for 100 words."""
        words = ["beautful", "teh", "becasue", "freind", "shcool",
                 "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words:
            self.sym.lookup(w, self.Verbosity.CLOSEST, max_edit_distance=2)
        dt = time.time() - t0
        print(f"\n symspellpy: 100 EN corrections in {dt*1000:.0f}ms")
    def test_compound_segmentation(self):
        """Test symspellpy's word segmentation for merged words.

        Reporting-only: prints OK/MISS, no assertions.
        """
        cases = [
            ("atmyschool", "at my school"),
            ("goodidea", "good idea"),
            ("makeadecision", "make a decision"),
        ]
        for merged, expected in cases:
            result = self.sym.word_segmentation(merged)
            ok = result.corrected_string == expected
            status = "OK" if ok else "MISS"
            print(f" {status}: '{merged}''{result.corrected_string}' (expected '{expected}')")
class TestContextDisambiguation:
    """Test context-based disambiguation for a/I and similar cases.

    Prototype for the bigram heuristic later productionized in
    smart_spell (reporting-only: prints OK/MISS, no assertions).
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()
    def test_bigram_context(self):
        """Use simple bigram heuristic for a/I disambiguation.
        Approach: check if 'a <next_word>' or 'I <next_word>' is more
        common by checking if <next_word> is a noun (follows 'a') or
        verb (follows 'I').
        """
        # Common words that follow "I" (verbs)
        i_followers = {"am", "was", "have", "had", "do", "did", "will",
                       "would", "can", "could", "should", "shall", "may",
                       "might", "think", "know", "see", "want", "need",
                       "like", "love", "hate", "go", "went", "come",
                       "came", "say", "said", "get", "got", "make", "made",
                       "take", "took", "give", "gave", "tell", "told",
                       "feel", "felt", "find", "found", "believe", "hope",
                       "remember", "forget", "understand", "mean", "meant",
                       "don't", "didn't", "can't", "won't", "couldn't",
                       "shouldn't", "wouldn't", "haven't", "hadn't"}
        # Common words that follow "a" (nouns/adjectives)
        a_followers = {"lot", "few", "little", "bit", "good", "bad",
                       "big", "small", "great", "new", "old", "long",
                       "short", "man", "woman", "boy", "girl", "dog",
                       "cat", "book", "car", "house", "day", "year",
                       "nice", "beautiful", "large", "huge", "tiny"}
        def disambiguate_a_I(token: str, next_word: str) -> str:
            """Given an ambiguous 'a' or 'I' (or 'l'), pick the right one."""
            nw = next_word.lower()
            if nw in i_followers:
                return "I"
            if nw in a_followers:
                return "a"
            # Fallback: if next word is known verb → I, known adj/noun → a
            # For now, use a simple heuristic: lowercase → "a", uppercase first letter → "I"
            return token  # no change if uncertain
        # (token, next_word, expected_resolution)
        cases = [
            ("l", "am", "I"),
            ("l", "was", "I"),
            ("l", "think", "I"),
            ("a", "book", "a"),
            ("a", "cat", "a"),
            ("a", "lot", "a"),
            ("l", "big", "a"),  # "a big ..."
            ("a", "have", "I"),  # "I have ..."
        ]
        results = []
        for token, next_word, expected in cases:
            result = disambiguate_a_I(token, next_word)
            ok = result == expected
            results.append((token, next_word, expected, result, ok))
            status = "OK" if ok else "MISS"
            # NOTE(review): a separator (likely "→") appears lost between
            # the quoted values — confirm against the original source.
            print(f" {status}: '{token} {next_word}...''{result}' (expected '{expected}')")
        correct = sum(1 for *_, ok in results if ok)
        total = len(results)
        print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)")
class TestLangDetectLibrary:
    """Test py3langid or langdetect if available.

    NOTE(review): despite the docstring/method names, both tests import the
    `langid` package — confirm whether py3langid was intended.
    Reporting-only: prints OK/MISS, no assertions.
    """
    def test_py3langid(self):
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")
        # (sentence, expected language)
        sentences = [
            ("I go to school every day", "en"),
            ("Ich gehe jeden Tag zur Schule", "de"),
            ("The weather is nice today", "en"),
            ("Das Wetter ist heute schön", "de"),
            ("She likes to play football", "en"),
            ("Er spielt gerne Fußball", "de"),
        ]
        results = []
        for text, expected in sentences:
            lang, confidence = langid.classify(text)
            ok = lang == expected
            results.append(ok)
            status = "OK" if ok else "MISS"
            print(f" {status}: '{text[:40]}...'{lang} ({confidence:.2f}) (expected {expected})")
        correct = sum(results)
        print(f"\nlangid sentence detection: {correct}/{len(results)} correct")
    def test_langid_single_words(self):
        """langid on single words — expected to be unreliable."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")
        words = [("school", "en"), ("Schule", "de"), ("book", "en"),
                 ("Buch", "de"), ("car", "en"), ("Auto", "de"),
                 ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")]
        results = []
        for word, expected in words:
            lang, conf = langid.classify(word)
            ok = lang == expected
            results.append(ok)
            status = "OK" if ok else "MISS"
            print(f" {status}: '{word}'{lang} ({conf:.2f}) (expected {expected})")
        correct = sum(results)
        print(f"\nlangid single-word: {correct}/{len(results)} correct")
class TestIntegratedApproach:
    """Test the combined approach: dict-heuristic for lang + spell correction.

    Prototype of the pipeline productionized in smart_spell.
    Reporting-only: prints OK/MISS per entry, no assertions.
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()
    def detect_language(self, word: str) -> str:
        """Dual-dict heuristic language detection."""
        w = word.lower()
        # Skip very short words — too ambiguous
        if len(w) <= 2:
            return "ambiguous"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"
    def correct_word(self, word: str, expected_lang: str) -> str:
        """Correct a single word given the expected language.

        Returns the input unchanged when it is valid in either language
        (cross-language guard) or when no correction is found.
        """
        w_lower = word.lower()
        spell = self.en if expected_lang == "en" else self.de
        # Already known
        if spell.known([w_lower]):
            return word
        # Also check the other language — might be fine
        other = self.de if expected_lang == "en" else self.en
        if other.known([w_lower]):
            return word  # valid in the other language
        # Try correction
        fix = spell.correction(w_lower)
        if fix and fix != w_lower:
            # Preserve the original initial capitalization.
            if word[0].isupper():
                fix = fix[0].upper() + fix[1:]
            return fix
        return word
    def test_full_pipeline(self):
        """Test: detect language → correct with appropriate dict."""
        vocab_entries = [
            # (english_col, german_col, expected_en, expected_de)
            ("beautful", "schön", "beautiful", "schön"),
            ("school", "Schule", "school", "Schule"),
            ("teh cat", "die Katze", "the cat", "die Katze"),
            ("freind", "Freund", "friend", "Freund"),
            ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"),
            ("Schuler", "Schuler", "Schuler", "Schüler"),  # DE umlaut: Schüler
        ]
        en_correct = 0
        de_correct = 0
        total = len(vocab_entries)
        for en_ocr, de_ocr, exp_en, exp_de in vocab_entries:
            # Correct each word in the column
            en_words = en_ocr.split()
            de_words = de_ocr.split()
            en_fixed = " ".join(self.correct_word(w, "en") for w in en_words)
            de_fixed = " ".join(self.correct_word(w, "de") for w in de_words)
            en_ok = en_fixed == exp_en
            de_ok = de_fixed == exp_de
            # bool adds as 0/1 — running tally of correct columns.
            en_correct += en_ok
            de_correct += de_ok
            en_status = "OK" if en_ok else "MISS"
            de_status = "OK" if de_ok else "MISS"
            # NOTE(review): a separator (likely "→") appears lost between
            # the quoted values — confirm against the original source.
            print(f" EN {en_status}: '{en_ocr}''{en_fixed}' (expected '{exp_en}')")
            print(f" DE {de_status}: '{de_ocr}''{de_fixed}' (expected '{exp_de}')")
        print(f"\nEN corrections: {en_correct}/{total} correct")
        print(f"DE corrections: {de_correct}/{total} correct")