""" Benchmark: Spell-checking & language detection approaches for OCR post-correction. Tests pyspellchecker (already used), symspellpy (candidate), and dual-dictionary language detection heuristic on real vocabulary OCR data. Run: pytest tests/test_spell_benchmark.py -v -s """ import time import pytest # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _load_pyspellchecker(): from spellchecker import SpellChecker en = SpellChecker(language='en', distance=1) de = SpellChecker(language='de', distance=1) return en, de def _load_symspellpy(): """Load symspellpy with English frequency dict (bundled).""" from symspellpy import SymSpell, Verbosity sym = SymSpell(max_dictionary_edit_distance=2) # Use bundled English frequency dict import pkg_resources dict_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt") sym.load_dictionary(dict_path, term_index=0, count_index=1) return sym, Verbosity # --------------------------------------------------------------------------- # Test data: (ocr_output, expected_correction, language, category) # --------------------------------------------------------------------------- OCR_TEST_CASES = [ # --- Single-char ambiguity --- ("l am a student", "I am a student", "en", "a_vs_I"), ("a book", "a book", "en", "a_vs_I"), # should NOT change ("I like cats", "I like cats", "en", "a_vs_I"), # should NOT change ("lt is raining", "It is raining", "en", "a_vs_I"), # l→I at start # --- Digit-letter confusion --- ("g0od", "good", "en", "digit_letter"), ("sch00l", "school", "en", "digit_letter"), ("he1lo", "hello", "en", "digit_letter"), ("Sch0n", "Schon", "de", "digit_letter"), # German # --- Umlaut drops --- ("schon", "schön", "de", "umlaut"), # context: "schon" is also valid DE! ("Schuler", "Schüler", "de", "umlaut"), ("uber", "über", "de", "umlaut"), ("Bucher", "Bücher", "de", "umlaut"), ("Turkei", "Türkei", "de", "umlaut"), # --- Common OCR errors --- ("beautful", "beautiful", "en", "missing_char"), ("teh", "the", "en", "transposition"), ("becasue", "because", "en", "transposition"), ("freind", "friend", "en", "swap"), ("Freund", "Freund", "de", "correct"), # already correct # --- Merged words --- ("atmyschool", "at my school", "en", "merged"), ("goodidea", "good idea", "en", "merged"), # --- Mixed language example sentences --- ("I go to teh school", "I go to the school", "en", "sentence"), ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"), ] # Language detection test: (word, expected_language) LANG_DETECT_CASES = [ # Clear English ("school", "en"), ("beautiful", "en"), ("homework", "en"), ("yesterday", "en"), ("children", "en"), ("because", "en"), ("environment", "en"), ("although", "en"), # Clear German ("Schule", "de"), ("Hausaufgaben", "de"), ("Freundschaft", "de"), ("Umwelt", "de"), ("Kindergarten", "de"), # also used in English! ("Bücher", "de"), ("Straße", "de"), ("Entschuldigung", "de"), # Ambiguous (exist in both) ("Hand", "both"), ("Finger", "both"), ("Arm", "both"), ("Name", "both"), ("Ball", "both"), # Short/tricky ("a", "en"), ("I", "en"), ("in", "both"), ("an", "both"), ("the", "en"), ("die", "de"), ("der", "de"), ("to", "en"), ("zu", "de"), ] # =========================================================================== # Tests # =========================================================================== class TestPyspellchecker: """Test pyspellchecker capabilities for OCR correction.""" @pytest.fixture(autouse=True) def setup(self): self.en, self.de = _load_pyspellchecker() def test_known_words(self): """Verify basic dictionary lookup.""" assert self.en.known(["school"]) assert self.en.known(["beautiful"]) assert self.de.known(["schule"]) # lowercase assert self.de.known(["freund"]) # Not known assert not self.en.known(["xyzqwk"]) assert not self.de.known(["xyzqwk"]) def test_correction_quality(self): """Test correction suggestions for OCR errors.""" results = [] for ocr, expected, lang, category in OCR_TEST_CASES: if category in ("sentence", "sentence_correct", "merged", "a_vs_I"): continue # skip multi-word cases spell = self.en if lang == "en" else self.de words = ocr.split() corrected = [] for w in words: if spell.known([w.lower()]): corrected.append(w) else: fix = spell.correction(w.lower()) if fix and fix != w.lower(): # Preserve case if w[0].isupper(): fix = fix[0].upper() + fix[1:] corrected.append(fix) else: corrected.append(w) result = " ".join(corrected) ok = result == expected results.append((ocr, expected, result, ok, category)) if not ok: print(f" MISS: '{ocr}' → '{result}' (expected '{expected}') [{category}]") else: print(f" OK: '{ocr}' → '{result}' [{category}]") correct = sum(1 for *_, ok, _ in results if ok) total = len(results) print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)") def test_language_detection_heuristic(self): """Test dual-dictionary language detection.""" results = [] for word, expected_lang in LANG_DETECT_CASES: w = word.lower() in_en = bool(self.en.known([w])) in_de = bool(self.de.known([w])) if in_en and in_de: detected = "both" elif in_en: detected = "en" elif in_de: detected = "de" else: detected = "unknown" ok = detected == expected_lang results.append((word, expected_lang, detected, ok)) if not ok: print(f" MISS: '{word}' → {detected} (expected {expected_lang})") else: print(f" OK: '{word}' → {detected}") correct = sum(1 for *_, ok in results if ok) total = len(results) print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)") def test_umlaut_awareness(self): """Test if pyspellchecker suggests umlaut corrections.""" # "Schuler" should suggest "Schüler" candidates = self.de.candidates("schuler") print(f" 'schuler' candidates: {candidates}") # "uber" should suggest "über" candidates_uber = self.de.candidates("uber") print(f" 'uber' candidates: {candidates_uber}") # "Turkei" should suggest "Türkei" candidates_turkei = self.de.candidates("turkei") print(f" 'turkei' candidates: {candidates_turkei}") def test_speed_100_words(self): """Measure correction speed for 100 words.""" words_en = ["beautful", "teh", "becasue", "freind", "shcool", "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10 t0 = time.time() for w in words_en: self.en.correction(w) dt = time.time() - t0 print(f"\n pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms") words_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung", "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10 t0 = time.time() for w in words_de: self.de.correction(w) dt = time.time() - t0 print(f" pyspellchecker: 100 DE corrections in {dt*1000:.0f}ms") class TestSymspellpy: """Test symspellpy as a faster alternative.""" @pytest.fixture(autouse=True) def setup(self): try: self.sym, self.Verbosity = _load_symspellpy() self.available = True except (ImportError, FileNotFoundError) as e: self.available = False pytest.skip(f"symspellpy not installed: {e}") def test_correction_quality(self): """Test symspellpy corrections (EN only — no DE dict bundled).""" en_cases = [(o, e, c) for o, e, _, c in OCR_TEST_CASES if _ == "en" and c not in ("sentence", "sentence_correct", "merged", "a_vs_I")] results = [] for ocr, expected, category in en_cases: suggestions = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2) if suggestions: fix = suggestions[0].term if ocr[0].isupper(): fix = fix[0].upper() + fix[1:] result = fix else: result = ocr ok = result == expected results.append((ocr, expected, result, ok, category)) status = "OK" if ok else "MISS" print(f" {status}: '{ocr}' → '{result}' (expected '{expected}') [{category}]") correct = sum(1 for *_, ok, _ in results if ok) total = len(results) print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)") def test_speed_100_words(self): """Measure symspellpy correction speed for 100 words.""" words = ["beautful", "teh", "becasue", "freind", "shcool", "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10 t0 = time.time() for w in words: self.sym.lookup(w, self.Verbosity.CLOSEST, max_edit_distance=2) dt = time.time() - t0 print(f"\n symspellpy: 100 EN corrections in {dt*1000:.0f}ms") def test_compound_segmentation(self): """Test symspellpy's word segmentation for merged words.""" cases = [ ("atmyschool", "at my school"), ("goodidea", "good idea"), ("makeadecision", "make a decision"), ] for merged, expected in cases: result = self.sym.word_segmentation(merged) ok = result.corrected_string == expected status = "OK" if ok else "MISS" print(f" {status}: '{merged}' → '{result.corrected_string}' (expected '{expected}')") class TestContextDisambiguation: """Test context-based disambiguation for a/I and similar cases.""" @pytest.fixture(autouse=True) def setup(self): self.en, self.de = _load_pyspellchecker() def test_bigram_context(self): """Use simple bigram heuristic for a/I disambiguation. Approach: check if 'a ' or 'I ' is more common by checking if is a noun (follows 'a') or verb (follows 'I'). """ # Common words that follow "I" (verbs) i_followers = {"am", "was", "have", "had", "do", "did", "will", "would", "can", "could", "should", "shall", "may", "might", "think", "know", "see", "want", "need", "like", "love", "hate", "go", "went", "come", "came", "say", "said", "get", "got", "make", "made", "take", "took", "give", "gave", "tell", "told", "feel", "felt", "find", "found", "believe", "hope", "remember", "forget", "understand", "mean", "meant", "don't", "didn't", "can't", "won't", "couldn't", "shouldn't", "wouldn't", "haven't", "hadn't"} # Common words that follow "a" (nouns/adjectives) a_followers = {"lot", "few", "little", "bit", "good", "bad", "big", "small", "great", "new", "old", "long", "short", "man", "woman", "boy", "girl", "dog", "cat", "book", "car", "house", "day", "year", "nice", "beautiful", "large", "huge", "tiny"} def disambiguate_a_I(token: str, next_word: str) -> str: """Given an ambiguous 'a' or 'I' (or 'l'), pick the right one.""" nw = next_word.lower() if nw in i_followers: return "I" if nw in a_followers: return "a" # Fallback: if next word is known verb → I, known adj/noun → a # For now, use a simple heuristic: lowercase → "a", uppercase first letter → "I" return token # no change if uncertain cases = [ ("l", "am", "I"), ("l", "was", "I"), ("l", "think", "I"), ("a", "book", "a"), ("a", "cat", "a"), ("a", "lot", "a"), ("l", "big", "a"), # "a big ..." ("a", "have", "I"), # "I have ..." ] results = [] for token, next_word, expected in cases: result = disambiguate_a_I(token, next_word) ok = result == expected results.append((token, next_word, expected, result, ok)) status = "OK" if ok else "MISS" print(f" {status}: '{token} {next_word}...' → '{result}' (expected '{expected}')") correct = sum(1 for *_, ok in results if ok) total = len(results) print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)") class TestLangDetectLibrary: """Test py3langid or langdetect if available.""" def test_py3langid(self): try: import langid except ImportError: pytest.skip("langid not installed") sentences = [ ("I go to school every day", "en"), ("Ich gehe jeden Tag zur Schule", "de"), ("The weather is nice today", "en"), ("Das Wetter ist heute schön", "de"), ("She likes to play football", "en"), ("Er spielt gerne Fußball", "de"), ] results = [] for text, expected in sentences: lang, confidence = langid.classify(text) ok = lang == expected results.append(ok) status = "OK" if ok else "MISS" print(f" {status}: '{text[:40]}...' → {lang} ({confidence:.2f}) (expected {expected})") correct = sum(results) print(f"\nlangid sentence detection: {correct}/{len(results)} correct") def test_langid_single_words(self): """langid on single words — expected to be unreliable.""" try: import langid except ImportError: pytest.skip("langid not installed") words = [("school", "en"), ("Schule", "de"), ("book", "en"), ("Buch", "de"), ("car", "en"), ("Auto", "de"), ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")] results = [] for word, expected in words: lang, conf = langid.classify(word) ok = lang == expected results.append(ok) status = "OK" if ok else "MISS" print(f" {status}: '{word}' → {lang} ({conf:.2f}) (expected {expected})") correct = sum(results) print(f"\nlangid single-word: {correct}/{len(results)} correct") class TestIntegratedApproach: """Test the combined approach: dict-heuristic for lang + spell correction.""" @pytest.fixture(autouse=True) def setup(self): self.en, self.de = _load_pyspellchecker() def detect_language(self, word: str) -> str: """Dual-dict heuristic language detection.""" w = word.lower() # Skip very short words — too ambiguous if len(w) <= 2: return "ambiguous" in_en = bool(self.en.known([w])) in_de = bool(self.de.known([w])) if in_en and in_de: return "both" if in_en: return "en" if in_de: return "de" return "unknown" def correct_word(self, word: str, expected_lang: str) -> str: """Correct a single word given the expected language.""" w_lower = word.lower() spell = self.en if expected_lang == "en" else self.de # Already known if spell.known([w_lower]): return word # Also check the other language — might be fine other = self.de if expected_lang == "en" else self.en if other.known([w_lower]): return word # valid in the other language # Try correction fix = spell.correction(w_lower) if fix and fix != w_lower: if word[0].isupper(): fix = fix[0].upper() + fix[1:] return fix return word def test_full_pipeline(self): """Test: detect language → correct with appropriate dict.""" vocab_entries = [ # (english_col, german_col, expected_en, expected_de) ("beautful", "schön", "beautiful", "schön"), ("school", "Schule", "school", "Schule"), ("teh cat", "die Katze", "the cat", "die Katze"), ("freind", "Freund", "friend", "Freund"), ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"), ("Schuler", "Schuler", "Schuler", "Schüler"), # DE umlaut: Schüler ] en_correct = 0 de_correct = 0 total = len(vocab_entries) for en_ocr, de_ocr, exp_en, exp_de in vocab_entries: # Correct each word in the column en_words = en_ocr.split() de_words = de_ocr.split() en_fixed = " ".join(self.correct_word(w, "en") for w in en_words) de_fixed = " ".join(self.correct_word(w, "de") for w in de_words) en_ok = en_fixed == exp_en de_ok = de_fixed == exp_de en_correct += en_ok de_correct += de_ok en_status = "OK" if en_ok else "MISS" de_status = "OK" if de_ok else "MISS" print(f" EN {en_status}: '{en_ocr}' → '{en_fixed}' (expected '{exp_en}')") print(f" DE {de_status}: '{de_ocr}' → '{de_fixed}' (expected '{exp_de}')") print(f"\nEN corrections: {en_correct}/{total} correct") print(f"DE corrections: {de_correct}/{total} correct")