# breakpilot-lehrer/klausur-service/backend/tests/test_smart_spell.py

"""Tests for SmartSpellChecker — language-aware OCR post-correction."""

import pytest
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from smart_spell import SmartSpellChecker, CorrectionResult


@pytest.fixture
def sc():
    """Provide a newly constructed SmartSpellChecker for the requesting test."""
    checker = SmartSpellChecker()
    return checker


# ─── Language Detection ──────────────────────────────────────────────────────


class TestLanguageDetection:
    """Checks for ``detect_word_lang`` and ``detect_text_lang``."""

    def test_clear_english_words(self, sc):
        """English vocabulary must be classified as 'en' or 'both'."""
        accepted = ("en", "both")
        english_words = ("school", "beautiful", "homework", "yesterday", "because")
        for word in english_words:
            detected = sc.detect_word_lang(word)
            assert detected in accepted, f"{word} should be EN"

    def test_clear_german_words(self, sc):
        """German vocabulary must be classified as 'de' or 'both'."""
        accepted = ("de", "both")
        german_words = (
            "Schule",
            "Hausaufgaben",
            "Freundschaft",
            "Straße",
            "Entschuldigung",
        )
        for word in german_words:
            detected = sc.detect_word_lang(word)
            assert detected in accepted, f"{word} should be DE"

    def test_ambiguous_words(self, sc):
        """Words spelled the same in both languages must yield 'both'."""
        shared_words = ("Hand", "Finger", "Arm", "Name", "Ball")
        for word in shared_words:
            detected = sc.detect_word_lang(word)
            assert detected == "both", f"{word} should be 'both'"

    def test_unknown_words(self, sc):
        """Gibberish and the empty string must yield 'unknown'."""
        for token in ("xyzqwk", ""):
            assert sc.detect_word_lang(token) == "unknown"

    def test_english_sentence(self, sc):
        """A plain English sentence is detected as 'en'."""
        sentence = "I go to school every day"
        assert sc.detect_text_lang(sentence) == "en"

    def test_german_sentence(self, sc):
        """A plain German sentence is detected as 'de'."""
        sentence = "Ich gehe jeden Tag zur Schule"
        assert sc.detect_text_lang(sentence) == "de"

    def test_mixed_sentence(self, sc):
        """A mostly-English sentence with German words is 'en' or 'both'."""
        # The dominant language is expected to win.
        sentence = "I like to play Fußball with my Freunde"
        detected = sc.detect_text_lang(sentence)
        assert detected in ("en", "both")


# ─── Single Word Correction ────────────────────────────────────────────────


class TestSingleWordCorrection:
    """Checks for ``correct_word`` on individual OCR tokens."""

    def test_known_word_not_changed(self, sc):
        """Valid words produce no correction (``None``)."""
        for valid_word, lang in (("school", "en"), ("Freund", "de")):
            assert sc.correct_word(valid_word, lang) is None

    def test_digit_letter_single(self, sc):
        """A single digit standing in for a letter is repaired."""
        cases = (("g0od", "good"), ("he1lo", "hello"))
        for garbled, expected in cases:
            assert sc.correct_word(garbled, "en") == expected

    def test_digit_letter_multi(self, sc):
        """Several digit-for-letter substitutions in one word (sch00l)."""
        garbled = "sch00l"
        result = sc.correct_word(garbled, "en")
        assert result == "school", f"Expected 'school', got '{result}'"

    def test_pipe_to_I(self, sc):
        """A lone pipe character in English is read as the pronoun 'I'."""
        fixed = sc.correct_word("|", "en")
        assert fixed == "I"

    def test_umlaut_schuler(self, sc):
        """'Schuler' in German gets its umlaut restored."""
        fixed = sc.correct_word("Schuler", "de")
        assert fixed == "Schüler"

    def test_umlaut_uber(self, sc):
        """'uber' in German gets its umlaut restored."""
        fixed = sc.correct_word("uber", "de")
        assert fixed == "über"

    def test_umlaut_bucher(self, sc):
        """'Bucher' in German gets its umlaut restored."""
        fixed = sc.correct_word("Bucher", "de")
        assert fixed == "Bücher"

    def test_umlaut_turkei(self, sc):
        """'Turkei' in German gets its umlaut restored."""
        fixed = sc.correct_word("Turkei", "de")
        assert fixed == "Türkei"

    def test_missing_char(self, sc):
        """A dropped letter is re-inserted."""
        fixed = sc.correct_word("beautful", "en")
        assert fixed == "beautiful"

    def test_transposition(self, sc):
        """Two swapped adjacent letters are put back in order."""
        fixed = sc.correct_word("teh", "en")
        assert fixed == "the"

    def test_swap(self, sc):
        """An 'ei'/'ie' mix-up is repaired."""
        fixed = sc.correct_word("freind", "en")
        assert fixed == "friend"

    def test_no_false_correction_cross_lang(self, sc):
        """A word plausible in the other language is not mis-corrected.

        'Schuler' appearing in the EN column must not become 'Schuyler':
        'Schüler' is valid German, so the token is most likely a German
        word that landed in the wrong column (or a surname).
        """
        # Only the bad correction is ruled out; None (no change) or any
        # other result is accepted.
        german_in_en_column = "Schuler"
        result = sc.correct_word(german_in_en_column, "en")
        assert result != "Schuyler", "Should not false-correct German word in EN column"


# ─── a/I Disambiguation ──────────────────────────────────────────────────────


class TestAIDisambiguation:
    """Checks for ``_disambiguate_a_I`` using the following word as context."""

    def test_I_before_verb(self, sc):
        """An 'l' followed by a verb-like word resolves to 'I'."""
        followers = ("am", "was", "think", "have", "don't")
        for next_word in followers:
            assert sc._disambiguate_a_I("l", next_word) == "I"

    def test_a_before_noun_adj(self, sc):
        """An 'a' followed by a noun or adjective stays 'a'."""
        followers = ("book", "cat", "big", "lot")
        for next_word in followers:
            assert sc._disambiguate_a_I("a", next_word) == "a"

    def test_uncertain_returns_none(self, sc):
        """With an unrecognised following word the result is None (no change)."""
        verdict = sc._disambiguate_a_I("l", "xyzqwk")
        assert verdict is None


# ─── Full Text Correction ───────────────────────────────────────────────────


class TestFullTextCorrection:
    """Checks for ``correct_text`` on whole phrases."""

    def test_english_sentence(self, sc):
        """Misspelled English words are fixed and the result is flagged changed."""
        outcome = sc.correct_text("teh cat is beautful", "en")
        assert outcome.changed
        for expected_word in ("the", "beautiful"):
            assert expected_word in outcome.corrected

    def test_german_sentence_no_change(self, sc):
        """A correct German sentence is reported as unchanged."""
        outcome = sc.correct_text("Ich gehe zur Schule", "de")
        assert not outcome.changed

    def test_german_umlaut_fix(self, sc):
        """Missing umlauts are restored inside a German sentence."""
        outcome = sc.correct_text("Der Schuler liest Bucher", "de")
        for expected_word in ("Schüler", "Bücher"):
            assert expected_word in outcome.corrected

    def test_preserves_punctuation(self, sc):
        """Punctuation marks survive the correction pass."""
        outcome = sc.correct_text("teh cat, beautful!", "en")
        for mark in (",", "!"):
            assert mark in outcome.corrected

    def test_empty_text(self, sc):
        """Empty input stays empty and is reported as unchanged."""
        outcome = sc.correct_text("", "en")
        assert not outcome.changed
        assert outcome.corrected == ""


# ─── Vocab Entry Correction ─────────────────────────────────────────────────


class TestVocabEntryCorrection:
    """Checks for ``correct_vocab_entry`` on english/german/example fields."""

    def test_basic_entry(self, sc):
        """The English typo is fixed while the valid German word is untouched."""
        by_field = sc.correct_vocab_entry(english="beautful", german="schön")
        english_result = by_field["english"]
        german_result = by_field["german"]
        assert english_result.corrected == "beautiful"
        assert german_result.changed is False

    def test_umlaut_in_german(self, sc):
        """The German umlaut is restored while the valid English word is untouched."""
        by_field = sc.correct_vocab_entry(english="school", german="Schuler")
        english_result = by_field["english"]
        german_result = by_field["german"]
        assert english_result.changed is False
        assert german_result.corrected == "Schüler"

    def test_example_auto_detect(self, sc):
        """A typo in the example sentence is corrected without a language hint."""
        by_field = sc.correct_vocab_entry(
            english="friend",
            german="Freund",
            example="My best freind lives in Berlin",
        )
        example_result = by_field["example"]
        assert "friend" in example_result.corrected


# ─── Speed ─────────────────────────────────────────────────────────────────


class TestSpeed:
    """Coarse performance guard for ``correct_word``."""

    def test_100_corrections_under_500ms(self, sc):
        """100 word corrections should complete in under 500ms."""
        import time

        # 10 distinct misspellings (5 EN, 5 DE) repeated 10x -> 100 calls.
        words = [
            ("beautful", "en"), ("teh", "en"), ("freind", "en"),
            ("homwork", "en"), ("yesturday", "en"),
            ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"),
            ("uber", "de"), ("Ubung", "de"),
        ] * 10

        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the wall clock, which can be adjusted mid-run and is coarse on some
        # platforms, making the elapsed-time assertion flaky.
        t0 = time.perf_counter()
        for word, lang in words:
            sc.correct_word(word, lang)
        dt = time.perf_counter() - t0

        print(f"\n  100 corrections in {dt*1000:.0f}ms")
        assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms"