"""Tests for SmartSpellChecker — language-aware OCR post-correction.""" import pytest import sys, os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from smart_spell import SmartSpellChecker, CorrectionResult @pytest.fixture def sc(): return SmartSpellChecker() # ─── Language Detection ────────────────────────────────────────────────────── class TestLanguageDetection: def test_clear_english_words(self, sc): for word in ("school", "beautiful", "homework", "yesterday", "because"): assert sc.detect_word_lang(word) in ("en", "both"), f"{word} should be EN" def test_clear_german_words(self, sc): for word in ("Schule", "Hausaufgaben", "Freundschaft", "Straße", "Entschuldigung"): assert sc.detect_word_lang(word) in ("de", "both"), f"{word} should be DE" def test_ambiguous_words(self, sc): """Words that exist in both languages.""" for word in ("Hand", "Finger", "Arm", "Name", "Ball"): assert sc.detect_word_lang(word) == "both", f"{word} should be 'both'" def test_unknown_words(self, sc): assert sc.detect_word_lang("xyzqwk") == "unknown" assert sc.detect_word_lang("") == "unknown" def test_english_sentence(self, sc): assert sc.detect_text_lang("I go to school every day") == "en" def test_german_sentence(self, sc): assert sc.detect_text_lang("Ich gehe jeden Tag zur Schule") == "de" def test_mixed_sentence(self, sc): # Dominant language should win lang = sc.detect_text_lang("I like to play Fußball with my Freunde") assert lang in ("en", "both") # ─── Single Word Correction ──────────────────────────────────────────────── class TestSingleWordCorrection: def test_known_word_not_changed(self, sc): assert sc.correct_word("school", "en") is None assert sc.correct_word("Freund", "de") is None def test_digit_letter_single(self, sc): assert sc.correct_word("g0od", "en") == "good" assert sc.correct_word("he1lo", "en") == "hello" def test_digit_letter_multi(self, sc): """Multiple digit substitutions (e.g., sch00l).""" result = sc.correct_word("sch00l", "en") assert result == "school", f"Expected 'school', got '{result}'" def test_pipe_to_I(self, sc): assert sc.correct_word("|", "en") == "I" def test_umlaut_schuler(self, sc): assert sc.correct_word("Schuler", "de") == "Schüler" def test_umlaut_uber(self, sc): assert sc.correct_word("uber", "de") == "über" def test_umlaut_bucher(self, sc): assert sc.correct_word("Bucher", "de") == "Bücher" def test_umlaut_turkei(self, sc): assert sc.correct_word("Turkei", "de") == "Türkei" def test_missing_char(self, sc): assert sc.correct_word("beautful", "en") == "beautiful" def test_transposition(self, sc): assert sc.correct_word("teh", "en") == "the" def test_swap(self, sc): assert sc.correct_word("freind", "en") == "friend" def test_no_false_correction_cross_lang(self, sc): """Don't correct a word that's valid in the other language. 'Schuler' in the EN column should NOT be corrected to 'Schuyler' because 'Schüler' is valid German — it's likely a German word that ended up in the wrong column (or is a surname). """ # Schuler is valid DE (after umlaut fix → Schüler), so # in the EN column it should be left alone result = sc.correct_word("Schuler", "en") # Should either be None (no change) or not "Schuyler" assert result != "Schuyler", "Should not false-correct German word in EN column" # ─── a/I Disambiguation ────────────────────────────────────────────────────── class TestAIDisambiguation: def test_I_before_verb(self, sc): assert sc._disambiguate_a_I("l", "am") == "I" assert sc._disambiguate_a_I("l", "was") == "I" assert sc._disambiguate_a_I("l", "think") == "I" assert sc._disambiguate_a_I("l", "have") == "I" assert sc._disambiguate_a_I("l", "don't") == "I" def test_a_before_noun_adj(self, sc): assert sc._disambiguate_a_I("a", "book") == "a" assert sc._disambiguate_a_I("a", "cat") == "a" assert sc._disambiguate_a_I("a", "big") == "a" assert sc._disambiguate_a_I("a", "lot") == "a" def test_uncertain_returns_none(self, sc): """When context is ambiguous, return None (don't change).""" assert sc._disambiguate_a_I("l", "xyzqwk") is None # ─── Full Text Correction ─────────────────────────────────────────────────── class TestFullTextCorrection: def test_english_sentence(self, sc): result = sc.correct_text("teh cat is beautful", "en") assert result.changed assert "the" in result.corrected assert "beautiful" in result.corrected def test_german_sentence_no_change(self, sc): result = sc.correct_text("Ich gehe zur Schule", "de") assert not result.changed def test_german_umlaut_fix(self, sc): result = sc.correct_text("Der Schuler liest Bucher", "de") assert "Schüler" in result.corrected assert "Bücher" in result.corrected def test_preserves_punctuation(self, sc): result = sc.correct_text("teh cat, beautful!", "en") assert "," in result.corrected assert "!" in result.corrected def test_empty_text(self, sc): result = sc.correct_text("", "en") assert not result.changed assert result.corrected == "" # ─── Vocab Entry Correction ───────────────────────────────────────────────── class TestVocabEntryCorrection: def test_basic_entry(self, sc): results = sc.correct_vocab_entry( english="beautful", german="schön", ) assert results["english"].corrected == "beautiful" assert results["german"].changed is False def test_umlaut_in_german(self, sc): results = sc.correct_vocab_entry( english="school", german="Schuler", ) assert results["english"].changed is False assert results["german"].corrected == "Schüler" def test_example_auto_detect(self, sc): results = sc.correct_vocab_entry( english="friend", german="Freund", example="My best freind lives in Berlin", ) assert "friend" in results["example"].corrected # ─── Speed ───────────────────────────────────────────────────────────────── class TestSpeed: def test_100_corrections_under_500ms(self, sc): """100 word corrections should complete in under 500ms.""" import time words = [ ("beautful", "en"), ("teh", "en"), ("freind", "en"), ("homwork", "en"), ("yesturday", "en"), ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"), ("uber", "de"), ("Ubung", "de"), ] * 10 t0 = time.time() for word, lang in words: sc.correct_word(word, lang) dt = time.time() - t0 print(f"\n 100 corrections in {dt*1000:.0f}ms") assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms"