"""Tests for SmartSpellChecker — language-aware OCR post-correction.""" import pytest import sys, os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from smart_spell import SmartSpellChecker, CorrectionResult @pytest.fixture def sc(): return SmartSpellChecker() # ─── Language Detection ────────────────────────────────────────────────────── class TestLanguageDetection: def test_clear_english_words(self, sc): for word in ("school", "beautiful", "homework", "yesterday", "because"): assert sc.detect_word_lang(word) in ("en", "both"), f"{word} should be EN" def test_clear_german_words(self, sc): for word in ("Schule", "Hausaufgaben", "Freundschaft", "Straße", "Entschuldigung"): assert sc.detect_word_lang(word) in ("de", "both"), f"{word} should be DE" def test_ambiguous_words(self, sc): """Words that exist in both languages.""" for word in ("Hand", "Finger", "Arm", "Name", "Ball"): assert sc.detect_word_lang(word) == "both", f"{word} should be 'both'" def test_unknown_words(self, sc): assert sc.detect_word_lang("xyzqwk") == "unknown" assert sc.detect_word_lang("") == "unknown" def test_english_sentence(self, sc): assert sc.detect_text_lang("I go to school every day") == "en" def test_german_sentence(self, sc): assert sc.detect_text_lang("Ich gehe jeden Tag zur Schule") == "de" def test_mixed_sentence(self, sc): # Dominant language should win lang = sc.detect_text_lang("I like to play Fußball with my Freunde") assert lang in ("en", "both") # ─── Single Word Correction ──────────────────────────────────────────────── class TestSingleWordCorrection: def test_known_word_not_changed(self, sc): assert sc.correct_word("school", "en") is None assert sc.correct_word("Freund", "de") is None def test_digit_letter_single(self, sc): assert sc.correct_word("g0od", "en") == "good" assert sc.correct_word("he1lo", "en") == "hello" def test_digit_letter_multi(self, sc): """Multiple digit substitutions (e.g., sch00l).""" result = sc.correct_word("sch00l", "en") assert result == "school", f"Expected 'school', got '{result}'" def test_pipe_to_I(self, sc): assert sc.correct_word("|", "en") == "I" def test_umlaut_schuler(self, sc): assert sc.correct_word("Schuler", "de") == "Schüler" def test_umlaut_uber(self, sc): assert sc.correct_word("uber", "de") == "über" def test_umlaut_bucher(self, sc): assert sc.correct_word("Bucher", "de") == "Bücher" def test_umlaut_turkei(self, sc): assert sc.correct_word("Turkei", "de") == "Türkei" def test_missing_char(self, sc): assert sc.correct_word("beautful", "en") == "beautiful" def test_transposition(self, sc): assert sc.correct_word("teh", "en") == "the" def test_swap(self, sc): assert sc.correct_word("freind", "en") == "friend" def test_no_false_correction_cross_lang(self, sc): """Don't correct a word that's valid in the other language. 'Schuler' in the EN column should NOT be corrected to 'Schuyler' because 'Schüler' is valid German — it's likely a German word that ended up in the wrong column (or is a surname). """ # Schuler is valid DE (after umlaut fix → Schüler), so # in the EN column it should be left alone result = sc.correct_word("Schuler", "en") # Should either be None (no change) or not "Schuyler" assert result != "Schuyler", "Should not false-correct German word in EN column" # ─── a/I Disambiguation ────────────────────────────────────────────────────── class TestAIDisambiguation: def test_I_before_verb(self, sc): assert sc._disambiguate_a_I("l", "am") == "I" assert sc._disambiguate_a_I("l", "was") == "I" assert sc._disambiguate_a_I("l", "think") == "I" assert sc._disambiguate_a_I("l", "have") == "I" assert sc._disambiguate_a_I("l", "don't") == "I" def test_a_before_noun_adj(self, sc): assert sc._disambiguate_a_I("a", "book") == "a" assert sc._disambiguate_a_I("a", "cat") == "a" assert sc._disambiguate_a_I("a", "big") == "a" assert sc._disambiguate_a_I("a", "lot") == "a" def test_uncertain_returns_none(self, sc): """When context is ambiguous, return None (don't change).""" assert sc._disambiguate_a_I("l", "xyzqwk") is None # ─── Full Text Correction ─────────────────────────────────────────────────── class TestFullTextCorrection: def test_english_sentence(self, sc): result = sc.correct_text("teh cat is beautful", "en") assert result.changed assert "the" in result.corrected assert "beautiful" in result.corrected def test_german_sentence_no_change(self, sc): result = sc.correct_text("Ich gehe zur Schule", "de") assert not result.changed def test_german_umlaut_fix(self, sc): result = sc.correct_text("Der Schuler liest Bucher", "de") assert "Schüler" in result.corrected assert "Bücher" in result.corrected def test_preserves_punctuation(self, sc): result = sc.correct_text("teh cat, beautful!", "en") assert "," in result.corrected assert "!" in result.corrected def test_empty_text(self, sc): result = sc.correct_text("", "en") assert not result.changed assert result.corrected == "" # ─── Boundary Repair ─────────────────────────────────────────────────────── class TestBoundaryRepair: def test_ats_th_to_at_sth(self, sc): """'ats th.' → 'at sth.' — shifted boundary with abbreviation.""" result = sc.correct_text("be good ats th.", "en") assert "at sth." in result.corrected, f"Expected 'at sth.' in '{result.corrected}'" def test_no_repair_common_pair(self, sc): """Don't repair if both words form a common pair.""" result = sc.correct_text("at the", "en") assert result.corrected == "at the" assert not result.changed def test_boundary_shift_right(self, sc): """Shift chars from word1 to word2.""" repair = sc._try_boundary_repair("ats", "th") assert repair == ("at", "sth") or repair == ("at", "sth"), f"Got {repair}" def test_boundary_shift_with_punct(self, sc): """Preserve punctuation during boundary repair.""" repair = sc._try_boundary_repair("ats", "th.") assert repair is not None assert repair[0] == "at" assert repair[1] == "sth." def test_pound_sand_to_pounds_and(self, sc): """'Pound sand' → 'Pounds and' — both valid but repair is much more frequent.""" result = sc.correct_text("Pound sand euros", "en") assert "Pounds and" in result.corrected, f"Expected 'Pounds and' in '{result.corrected}'" def test_wit_hit_to_with_it(self, sc): """'wit hit' → 'with it' — frequency-based repair.""" result = sc.correct_text("be careful wit hit", "en") assert "with it" in result.corrected, f"Expected 'with it' in '{result.corrected}'" def test_done_euro_to_one_euro(self, sc): """'done euro' → 'one euro' in context.""" result = sc.correct_text("done euro", "en") assert "one euro" in result.corrected, f"Expected 'one euro' in '{result.corrected}'" # ─── Context Split ────────────────────────────────────────────────────────── class TestContextSplit: def test_anew_to_a_new(self, sc): """'anew' → 'a new' when followed by a noun.""" result = sc.correct_text("anew book", "en") assert result.corrected == "a new book", f"Got '{result.corrected}'" def test_anew_standalone_no_split(self, sc): """'anew' at end of phrase might genuinely be 'anew'.""" # "start anew" — no next word to indicate split # This is ambiguous, so we accept either behavior pass def test_alive_not_split(self, sc): """'alive' should never be split to 'a live'.""" result = sc.correct_text("alive and well", "en") assert "alive" in result.corrected def test_alone_not_split(self, sc): """'alone' should never be split.""" result = sc.correct_text("alone in the dark", "en") assert "alone" in result.corrected def test_about_not_split(self, sc): """'about' should never be split to 'a bout'.""" result = sc.correct_text("about time", "en") assert "about" in result.corrected # ─── Vocab Entry Correction ───────────────────────────────────────────────── class TestVocabEntryCorrection: def test_basic_entry(self, sc): results = sc.correct_vocab_entry( english="beautful", german="schön", ) assert results["english"].corrected == "beautiful" assert results["german"].changed is False def test_umlaut_in_german(self, sc): results = sc.correct_vocab_entry( english="school", german="Schuler", ) assert results["english"].changed is False assert results["german"].corrected == "Schüler" def test_example_auto_detect(self, sc): results = sc.correct_vocab_entry( english="friend", german="Freund", example="My best freind lives in Berlin", ) assert "friend" in results["example"].corrected # ─── Speed ───────────────────────────────────────────────────────────────── class TestSpeed: def test_100_corrections_under_500ms(self, sc): """100 word corrections should complete in under 500ms.""" import time words = [ ("beautful", "en"), ("teh", "en"), ("freind", "en"), ("homwork", "en"), ("yesturday", "en"), ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"), ("uber", "de"), ("Ubung", "de"), ] * 10 t0 = time.time() for word, lang in words: sc.correct_word(word, lang) dt = time.time() - t0 print(f"\n 100 corrections in {dt*1000:.0f}ms") assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms"