Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
SmartSpellChecker (klausur-service): - Language-aware OCR post-correction without LLMs - Dual-dictionary heuristic for EN/DE language detection - Context-based a/I disambiguation via bigram lookup - Multi-digit substitution (sch00l→school) - Cross-language guard (don't false-correct DE words in EN column) - Umlaut correction (Schuler→Schüler, uber→über) - Integrated into spell_review_entries_sync() pipeline - 31 tests, 9ms/100 corrections Vocab-worksheet refactoring (studio-v2): - Split 2337-line page.tsx into 14 files - Custom hook useVocabWorksheet.ts (all state + logic) - 9 components in components/ directory - types.ts, constants.ts for shared definitions Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
495 lines
18 KiB
Python
495 lines
18 KiB
Python
"""
|
|
Benchmark: Spell-checking & language detection approaches for OCR post-correction.
|
|
|
|
Tests pyspellchecker (already used), symspellpy (candidate), and
|
|
dual-dictionary language detection heuristic on real vocabulary OCR data.
|
|
|
|
Run: pytest tests/test_spell_benchmark.py -v -s
|
|
"""
|
|
|
|
import time
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _load_pyspellchecker():
    """Build the (English, German) pyspellchecker pair used by the tests.

    Both checkers are restricted to edit distance 1, matching production use.
    """
    from spellchecker import SpellChecker

    english = SpellChecker(language='en', distance=1)
    german = SpellChecker(language='de', distance=1)
    return english, german
|
|
|
|
|
|
def _load_symspellpy():
    """Load symspellpy with its bundled English frequency dictionary.

    Returns:
        tuple: (configured ``SymSpell`` instance, ``Verbosity`` enum) so
        callers can do ``sym.lookup(word, Verbosity.CLOSEST, ...)``.

    Raises:
        ImportError: if symspellpy is not installed.
        FileNotFoundError: if the bundled dictionary file is missing.
    """
    from symspellpy import SymSpell, Verbosity

    sym = SymSpell(max_dictionary_edit_distance=2)
    # Locate the EN frequency dict shipped inside the symspellpy package.
    # importlib.resources replaces the deprecated pkg_resources API
    # (pkg_resources emits a DeprecationWarning and is slated for removal
    # from setuptools).
    from importlib.resources import files

    dict_path = str(files("symspellpy") / "frequency_dictionary_en_82_765.txt")
    sym.load_dictionary(dict_path, term_index=0, count_index=1)
    return sym, Verbosity
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test data: (ocr_output, expected_correction, language, category)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Each case is (ocr_output, expected_correction, language, category).
OCR_TEST_CASES = [
    # Single-char ambiguity (OCR confuses l / I / a)
    ("l am a student", "I am a student", "en", "a_vs_I"),
    ("a book", "a book", "en", "a_vs_I"),                # should NOT change
    ("I like cats", "I like cats", "en", "a_vs_I"),      # should NOT change
    ("lt is raining", "It is raining", "en", "a_vs_I"),  # l→I at start

    # Digit-letter confusion
    ("g0od", "good", "en", "digit_letter"),
    ("sch00l", "school", "en", "digit_letter"),
    ("he1lo", "hello", "en", "digit_letter"),
    ("Sch0n", "Schon", "de", "digit_letter"),  # German

    # Dropped umlauts
    ("schon", "schön", "de", "umlaut"),  # tricky: "schon" is also valid DE!
    ("Schuler", "Schüler", "de", "umlaut"),
    ("uber", "über", "de", "umlaut"),
    ("Bucher", "Bücher", "de", "umlaut"),
    ("Turkei", "Türkei", "de", "umlaut"),

    # Common OCR errors
    ("beautful", "beautiful", "en", "missing_char"),
    ("teh", "the", "en", "transposition"),
    ("becasue", "because", "en", "transposition"),
    ("freind", "friend", "en", "swap"),
    ("Freund", "Freund", "de", "correct"),  # already correct

    # Merged words
    ("atmyschool", "at my school", "en", "merged"),
    ("goodidea", "good idea", "en", "merged"),

    # Mixed language example sentences
    ("I go to teh school", "I go to the school", "en", "sentence"),
    ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"),
]
|
|
|
|
# Language detection test: (word, expected_language)
|
|
# Language detection cases: (word, expected_language).
# "both" means the word exists in both the EN and DE dictionaries.
LANG_DETECT_CASES = [
    # Unambiguously English
    ("school", "en"),
    ("beautiful", "en"),
    ("homework", "en"),
    ("yesterday", "en"),
    ("children", "en"),
    ("because", "en"),
    ("environment", "en"),
    ("although", "en"),

    # Unambiguously German
    ("Schule", "de"),
    ("Hausaufgaben", "de"),
    ("Freundschaft", "de"),
    ("Umwelt", "de"),
    ("Kindergarten", "de"),  # also used in English!
    ("Bücher", "de"),
    ("Straße", "de"),
    ("Entschuldigung", "de"),

    # Ambiguous (exist in both)
    ("Hand", "both"),
    ("Finger", "both"),
    ("Arm", "both"),
    ("Name", "both"),
    ("Ball", "both"),

    # Short / tricky function words
    ("a", "en"),
    ("I", "en"),
    ("in", "both"),
    ("an", "both"),
    ("the", "en"),
    ("die", "de"),
    ("der", "de"),
    ("to", "en"),
    ("zu", "de"),
]
|
|
|
|
|
|
# ===========================================================================
|
|
# Tests
|
|
# ===========================================================================
|
|
|
|
|
|
class TestPyspellchecker:
    """Test pyspellchecker capabilities for OCR correction.

    The autouse ``setup`` fixture gives every test ``self.en`` and
    ``self.de`` — EN/DE SpellChecker instances at edit distance 1.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        # Fresh EN/DE dictionaries for each test method.
        self.en, self.de = _load_pyspellchecker()

    def test_known_words(self):
        """Verify basic dictionary lookup."""
        assert self.en.known(["school"])
        assert self.en.known(["beautiful"])
        assert self.de.known(["schule"])  # lowercase
        assert self.de.known(["freund"])
        # Not known
        assert not self.en.known(["xyzqwk"])
        assert not self.de.known(["xyzqwk"])

    def test_correction_quality(self):
        """Test correction suggestions for OCR errors.

        Runs single-word categories from OCR_TEST_CASES through
        ``correction()`` word-by-word and prints a hit/miss summary.
        """
        results = []
        for ocr, expected, lang, category in OCR_TEST_CASES:
            if category in ("sentence", "sentence_correct", "merged", "a_vs_I"):
                continue  # skip multi-word cases

            spell = self.en if lang == "en" else self.de
            words = ocr.split()
            corrected = []
            for w in words:
                # Known words pass through unchanged (lookup is lowercase).
                if spell.known([w.lower()]):
                    corrected.append(w)
                else:
                    fix = spell.correction(w.lower())
                    if fix and fix != w.lower():
                        # Preserve case
                        if w[0].isupper():
                            fix = fix[0].upper() + fix[1:]
                        corrected.append(fix)
                    else:
                        corrected.append(w)
            result = " ".join(corrected)
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            if not ok:
                print(f" MISS: '{ocr}' → '{result}' (expected '{expected}') [{category}]")
            else:
                print(f" OK: '{ocr}' → '{result}' [{category}]")

        # results rows are (ocr, expected, result, ok, category).
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_language_detection_heuristic(self):
        """Test dual-dictionary language detection.

        Detection rule: membership in EN dict only → "en", DE only →
        "de", both → "both", neither → "unknown".
        """
        results = []
        for word, expected_lang in LANG_DETECT_CASES:
            w = word.lower()
            in_en = bool(self.en.known([w]))
            in_de = bool(self.de.known([w]))

            if in_en and in_de:
                detected = "both"
            elif in_en:
                detected = "en"
            elif in_de:
                detected = "de"
            else:
                detected = "unknown"

            ok = detected == expected_lang
            results.append((word, expected_lang, detected, ok))
            if not ok:
                print(f" MISS: '{word}' → {detected} (expected {expected_lang})")
            else:
                print(f" OK: '{word}' → {detected}")

        correct = sum(1 for *_, ok in results if ok)
        total = len(results)
        print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_umlaut_awareness(self):
        """Test if pyspellchecker suggests umlaut corrections.

        Informational only — prints candidate sets, asserts nothing.
        """
        # "Schuler" should suggest "Schüler"
        candidates = self.de.candidates("schuler")
        print(f" 'schuler' candidates: {candidates}")
        # "uber" should suggest "über"
        candidates_uber = self.de.candidates("uber")
        print(f" 'uber' candidates: {candidates_uber}")
        # "Turkei" should suggest "Türkei"
        candidates_turkei = self.de.candidates("turkei")
        print(f" 'turkei' candidates: {candidates_turkei}")

    def test_speed_100_words(self):
        """Measure correction speed for 100 words (10 misspellings x 10)."""
        words_en = ["beautful", "teh", "becasue", "freind", "shcool",
                    "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words_en:
            self.en.correction(w)
        dt = time.time() - t0
        print(f"\n pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms")

        words_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung",
                    "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10
        t0 = time.time()
        for w in words_de:
            self.de.correction(w)
        dt = time.time() - t0
        print(f" pyspellchecker: 100 DE corrections in {dt*1000:.0f}ms")
|
|
|
|
|
|
class TestSymspellpy:
    """Test symspellpy as a faster alternative."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Skip the whole class gracefully when symspellpy (or its
        # bundled dictionary file) is not present in the environment.
        try:
            self.sym, self.Verbosity = _load_symspellpy()
        except (ImportError, FileNotFoundError) as e:
            self.available = False
            pytest.skip(f"symspellpy not installed: {e}")
        self.available = True

    def test_correction_quality(self):
        """Test symspellpy corrections (EN only — no DE dict bundled)."""
        skipped = ("sentence", "sentence_correct", "merged", "a_vs_I")
        en_cases = [
            (ocr, expected, cat)
            for ocr, expected, lang, cat in OCR_TEST_CASES
            if lang == "en" and cat not in skipped
        ]

        results = []
        for ocr, expected, category in en_cases:
            hits = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2)
            if not hits:
                result = ocr
            else:
                best = hits[0].term
                # Restore leading capitalization lost by the lowercase lookup.
                result = best[0].upper() + best[1:] if ocr[0].isupper() else best

            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            status = "OK" if ok else "MISS"
            print(f" {status}: '{ocr}' → '{result}' (expected '{expected}') [{category}]")

        correct = sum(1 for row in results if row[3])
        total = len(results)
        print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_speed_100_words(self):
        """Measure symspellpy correction speed for 100 words."""
        misspellings = ["beautful", "teh", "becasue", "freind", "shcool",
                        "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        started = time.time()
        for word in misspellings:
            self.sym.lookup(word, self.Verbosity.CLOSEST, max_edit_distance=2)
        elapsed = time.time() - started
        print(f"\n symspellpy: 100 EN corrections in {elapsed*1000:.0f}ms")

    def test_compound_segmentation(self):
        """Test symspellpy's word segmentation for merged words."""
        cases = [
            ("atmyschool", "at my school"),
            ("goodidea", "good idea"),
            ("makeadecision", "make a decision"),
        ]
        for merged, expected in cases:
            segmented = self.sym.word_segmentation(merged)
            status = "OK" if segmented.corrected_string == expected else "MISS"
            print(f" {status}: '{merged}' → '{segmented.corrected_string}' (expected '{expected}')")
|
|
|
|
|
|
class TestContextDisambiguation:
    """Test context-based disambiguation for a/I and similar cases."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def test_bigram_context(self):
        """Bigram heuristic for a/I disambiguation.

        Decide between 'a' and 'I' (or an OCR'd 'l') by classifying the
        following word: verb-like words typically follow 'I', noun- or
        adjective-like words typically follow 'a'.
        """
        # Words that commonly follow "I" (verbs / auxiliaries).
        verb_like = {"am", "was", "have", "had", "do", "did", "will",
                     "would", "can", "could", "should", "shall", "may",
                     "might", "think", "know", "see", "want", "need",
                     "like", "love", "hate", "go", "went", "come",
                     "came", "say", "said", "get", "got", "make", "made",
                     "take", "took", "give", "gave", "tell", "told",
                     "feel", "felt", "find", "found", "believe", "hope",
                     "remember", "forget", "understand", "mean", "meant",
                     "don't", "didn't", "can't", "won't", "couldn't",
                     "shouldn't", "wouldn't", "haven't", "hadn't"}

        # Words that commonly follow "a" (nouns / adjectives).
        noun_like = {"lot", "few", "little", "bit", "good", "bad",
                     "big", "small", "great", "new", "old", "long",
                     "short", "man", "woman", "boy", "girl", "dog",
                     "cat", "book", "car", "house", "day", "year",
                     "nice", "beautiful", "large", "huge", "tiny"}

        def pick_token(token: str, next_word: str) -> str:
            """Resolve an ambiguous 'a'/'I'/'l' from the following word."""
            follower = next_word.lower()
            if follower in verb_like:
                return "I"
            if follower in noun_like:
                return "a"
            # Uncertain: leave the token unchanged rather than guess.
            return token

        cases = [
            ("l", "am", "I"),
            ("l", "was", "I"),
            ("l", "think", "I"),
            ("a", "book", "a"),
            ("a", "cat", "a"),
            ("a", "lot", "a"),
            ("l", "big", "a"),   # "a big ..."
            ("a", "have", "I"),  # "I have ..."
        ]

        results = []
        for token, next_word, expected in cases:
            resolved = pick_token(token, next_word)
            ok = resolved == expected
            results.append((token, next_word, expected, resolved, ok))
            status = "OK" if ok else "MISS"
            print(f" {status}: '{token} {next_word}...' → '{resolved}' (expected '{expected}')")

        correct = sum(1 for row in results if row[-1])
        total = len(results)
        print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)")
|
|
|
|
|
|
class TestLangDetectLibrary:
    """Test py3langid or langdetect if available."""

    def test_py3langid(self):
        """langid on full sentences — its intended use case."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")

        sentences = [
            ("I go to school every day", "en"),
            ("Ich gehe jeden Tag zur Schule", "de"),
            ("The weather is nice today", "en"),
            ("Das Wetter ist heute schön", "de"),
            ("She likes to play football", "en"),
            ("Er spielt gerne Fußball", "de"),
        ]

        outcomes = []
        for text, expected in sentences:
            detected, confidence = langid.classify(text)
            matched = detected == expected
            outcomes.append(matched)
            status = "OK" if matched else "MISS"
            print(f" {status}: '{text[:40]}...' → {detected} ({confidence:.2f}) (expected {expected})")

        print(f"\nlangid sentence detection: {sum(outcomes)}/{len(outcomes)} correct")

    def test_langid_single_words(self):
        """langid on single words — expected to be unreliable."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")

        words = [("school", "en"), ("Schule", "de"), ("book", "en"),
                 ("Buch", "de"), ("car", "en"), ("Auto", "de"),
                 ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")]

        outcomes = []
        for word, expected in words:
            detected, confidence = langid.classify(word)
            matched = detected == expected
            outcomes.append(matched)
            status = "OK" if matched else "MISS"
            print(f" {status}: '{word}' → {detected} ({confidence:.2f}) (expected {expected})")

        print(f"\nlangid single-word: {sum(outcomes)}/{len(outcomes)} correct")
|
|
|
|
|
|
class TestIntegratedApproach:
    """Test the combined approach: dict-heuristic for lang + spell correction."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def detect_language(self, word: str) -> str:
        """Dual-dict heuristic language detection.

        Returns one of "en", "de", "both", "unknown", or "ambiguous"
        (for words of two characters or fewer).
        """
        lowered = word.lower()
        # Very short words are too ambiguous to classify.
        if len(lowered) <= 2:
            return "ambiguous"
        known_en = bool(self.en.known([lowered]))
        known_de = bool(self.de.known([lowered]))
        if known_en:
            return "both" if known_de else "en"
        return "de" if known_de else "unknown"

    def correct_word(self, word: str, expected_lang: str) -> str:
        """Correct a single word given the expected language.

        Words valid in either language pass through untouched
        (cross-language guard); otherwise the expected-language
        dictionary proposes a correction, with the original leading
        capitalization preserved.
        """
        lowered = word.lower()
        if expected_lang == "en":
            primary, secondary = self.en, self.de
        else:
            primary, secondary = self.de, self.en

        # Already valid in the target language, or valid in the other
        # language — leave it alone.
        if primary.known([lowered]) or secondary.known([lowered]):
            return word

        suggestion = primary.correction(lowered)
        if suggestion and suggestion != lowered:
            if word[0].isupper():
                suggestion = suggestion[0].upper() + suggestion[1:]
            return suggestion

        return word

    def test_full_pipeline(self):
        """Test: detect language → correct with appropriate dict."""
        vocab_entries = [
            # (english_col, german_col, expected_en, expected_de)
            ("beautful", "schön", "beautiful", "schön"),
            ("school", "Schule", "school", "Schule"),
            ("teh cat", "die Katze", "the cat", "die Katze"),
            ("freind", "Freund", "friend", "Freund"),
            ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"),
            ("Schuler", "Schuler", "Schuler", "Schüler"),  # DE umlaut: Schüler
        ]

        en_correct = 0
        de_correct = 0
        total = len(vocab_entries)

        for en_ocr, de_ocr, exp_en, exp_de in vocab_entries:
            # Correct each whitespace-separated word in its column.
            en_fixed = " ".join(self.correct_word(w, "en") for w in en_ocr.split())
            de_fixed = " ".join(self.correct_word(w, "de") for w in de_ocr.split())

            en_ok = en_fixed == exp_en
            de_ok = de_fixed == exp_de
            en_correct += en_ok
            de_correct += de_ok

            en_status = "OK" if en_ok else "MISS"
            de_status = "OK" if de_ok else "MISS"
            print(f" EN {en_status}: '{en_ocr}' → '{en_fixed}' (expected '{exp_en}')")
            print(f" DE {de_status}: '{de_ocr}' → '{de_fixed}' (expected '{exp_de}')")

        print(f"\nEN corrections: {en_correct}/{total} correct")
        print(f"DE corrections: {de_correct}/{total} correct")
|