Files
breakpilot-lehrer/klausur-service/backend/tests/test_spell_benchmark.py
Benjamin Admin 909d0729f6
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
Add SmartSpellChecker + refactor vocab-worksheet page.tsx
SmartSpellChecker (klausur-service):
- Language-aware OCR post-correction without LLMs
- Dual-dictionary heuristic for EN/DE language detection
- Context-based a/I disambiguation via bigram lookup
- Multi-digit substitution (sch00l→school)
- Cross-language guard (don't false-correct DE words in EN column)
- Umlaut correction (Schuler→Schüler, uber→über)
- Integrated into spell_review_entries_sync() pipeline
- 31 tests, 9ms/100 corrections

Vocab-worksheet refactoring (studio-v2):
- Split 2337-line page.tsx into 14 files
- Custom hook useVocabWorksheet.ts (all state + logic)
- 9 components in components/ directory
- types.ts, constants.ts for shared definitions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 12:25:01 +02:00

495 lines
18 KiB
Python

"""
Benchmark: Spell-checking & language detection approaches for OCR post-correction.
Tests pyspellchecker (already used), symspellpy (candidate), and
dual-dictionary language detection heuristic on real vocabulary OCR data.
Run: pytest tests/test_spell_benchmark.py -v -s
"""
import time
import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _load_pyspellchecker():
    """Build English and German pyspellchecker instances.

    Returns:
        (SpellChecker, SpellChecker): the English and German checkers,
        both configured with edit distance 1.
    """
    from spellchecker import SpellChecker

    # One checker per language, edit distance 1 keeps lookups fast.
    checkers = tuple(SpellChecker(language=code, distance=1) for code in ("en", "de"))
    return checkers
def _load_symspellpy():
    """Load symspellpy with English frequency dict (bundled).

    Returns:
        (SymSpell, Verbosity): a SymSpell instance with max edit distance 2
        and the bundled English frequency dictionary loaded, plus the
        Verbosity enum needed for lookup calls.

    Raises:
        ImportError: if symspellpy is not installed.
        FileNotFoundError: if the bundled dictionary file cannot be found.
    """
    from symspellpy import SymSpell, Verbosity

    sym = SymSpell(max_dictionary_edit_distance=2)
    # Use bundled English frequency dict. pkg_resources is deprecated and
    # absent from modern setuptools; importlib.resources is the stdlib
    # replacement for locating package data files.
    from importlib.resources import files

    dict_path = str(files("symspellpy") / "frequency_dictionary_en_82_765.txt")
    sym.load_dictionary(dict_path, term_index=0, count_index=1)
    return sym, Verbosity
# ---------------------------------------------------------------------------
# Test data: (ocr_output, expected_correction, language, category)
# ---------------------------------------------------------------------------
# Each entry: (raw OCR text, expected corrected text, language code "en"/"de",
# category label — used by the tests below to group results and to skip
# multi-word categories such as "sentence" or "merged").
OCR_TEST_CASES: list[tuple[str, str, str, str]] = [
    # --- Single-char ambiguity ---
    ("l am a student", "I am a student", "en", "a_vs_I"),
    ("a book", "a book", "en", "a_vs_I"),  # should NOT change
    ("I like cats", "I like cats", "en", "a_vs_I"),  # should NOT change
    ("lt is raining", "It is raining", "en", "a_vs_I"),  # l→I at start
    # --- Digit-letter confusion ---
    ("g0od", "good", "en", "digit_letter"),
    ("sch00l", "school", "en", "digit_letter"),
    ("he1lo", "hello", "en", "digit_letter"),
    ("Sch0n", "Schon", "de", "digit_letter"),  # German
    # --- Umlaut drops ---
    ("schon", "schön", "de", "umlaut"),  # context: "schon" is also valid DE!
    ("Schuler", "Schüler", "de", "umlaut"),
    ("uber", "über", "de", "umlaut"),
    ("Bucher", "Bücher", "de", "umlaut"),
    ("Turkei", "Türkei", "de", "umlaut"),
    # --- Common OCR errors ---
    ("beautful", "beautiful", "en", "missing_char"),
    ("teh", "the", "en", "transposition"),
    ("becasue", "because", "en", "transposition"),
    ("freind", "friend", "en", "swap"),
    ("Freund", "Freund", "de", "correct"),  # already correct
    # --- Merged words ---
    ("atmyschool", "at my school", "en", "merged"),
    ("goodidea", "good idea", "en", "merged"),
    # --- Mixed language example sentences ---
    ("I go to teh school", "I go to the school", "en", "sentence"),
    ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"),
]
# Language detection test: (word, expected_language)
# expected_language is "en", "de", or "both" (word exists in both
# dictionaries). Used by the dual-dictionary heuristic tests below.
LANG_DETECT_CASES: list[tuple[str, str]] = [
    # Clear English
    ("school", "en"),
    ("beautiful", "en"),
    ("homework", "en"),
    ("yesterday", "en"),
    ("children", "en"),
    ("because", "en"),
    ("environment", "en"),
    ("although", "en"),
    # Clear German
    ("Schule", "de"),
    ("Hausaufgaben", "de"),
    ("Freundschaft", "de"),
    ("Umwelt", "de"),
    ("Kindergarten", "de"),  # also used in English!
    ("Bücher", "de"),
    ("Straße", "de"),
    ("Entschuldigung", "de"),
    # Ambiguous (exist in both)
    ("Hand", "both"),
    ("Finger", "both"),
    ("Arm", "both"),
    ("Name", "both"),
    ("Ball", "both"),
    # Short/tricky
    ("a", "en"),
    ("I", "en"),
    ("in", "both"),
    ("an", "both"),
    ("the", "en"),
    ("die", "de"),
    ("der", "de"),
    ("to", "en"),
    ("zu", "de"),
]
# ===========================================================================
# Tests
# ===========================================================================
class TestPyspellchecker:
    """Test pyspellchecker capabilities for OCR correction."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Shared EN/DE checkers for every test in this class.
        self.en, self.de = _load_pyspellchecker()

    def _fix_token(self, spell, token):
        """Correct a single token via *spell*, preserving a leading capital."""
        lowered = token.lower()
        if spell.known([lowered]):
            return token
        suggestion = spell.correction(lowered)
        if suggestion and suggestion != lowered:
            # Preserve case of the original token.
            if token[0].isupper():
                suggestion = suggestion[0].upper() + suggestion[1:]
            return suggestion
        return token

    def _detect(self, word):
        """Classify *word* as 'en', 'de', 'both', or 'unknown' via dict lookups."""
        lowered = word.lower()
        hit_en = bool(self.en.known([lowered]))
        hit_de = bool(self.de.known([lowered]))
        if hit_en and hit_de:
            return "both"
        if hit_en:
            return "en"
        if hit_de:
            return "de"
        return "unknown"

    def test_known_words(self):
        """Verify basic dictionary lookup."""
        assert self.en.known(["school"])
        assert self.en.known(["beautiful"])
        assert self.de.known(["schule"])  # lowercase
        assert self.de.known(["freund"])
        # Nonsense token is unknown to both dictionaries.
        assert not self.en.known(["xyzqwk"])
        assert not self.de.known(["xyzqwk"])

    def test_correction_quality(self):
        """Test correction suggestions for OCR errors."""
        results = []
        for ocr, expected, lang, category in OCR_TEST_CASES:
            # Multi-word / context-dependent categories are out of scope here.
            if category in ("sentence", "sentence_correct", "merged", "a_vs_I"):
                continue
            spell = self.en if lang == "en" else self.de
            result = " ".join(self._fix_token(spell, tok) for tok in ocr.split())
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            if ok:
                print(f" OK: '{ocr}''{result}' [{category}]")
            else:
                print(f" MISS: '{ocr}''{result}' (expected '{expected}') [{category}]")
        correct = sum(ok for *_, ok, _ in results)
        total = len(results)
        print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_language_detection_heuristic(self):
        """Test dual-dictionary language detection."""
        results = []
        for word, expected_lang in LANG_DETECT_CASES:
            detected = self._detect(word)
            ok = detected == expected_lang
            results.append((word, expected_lang, detected, ok))
            if ok:
                print(f" OK: '{word}'{detected}")
            else:
                print(f" MISS: '{word}'{detected} (expected {expected_lang})")
        correct = sum(ok for *_, ok in results)
        total = len(results)
        print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_umlaut_awareness(self):
        """Test if pyspellchecker suggests umlaut corrections."""
        # Expected suggestions: schuler → Schüler, uber → über, turkei → Türkei.
        for probe in ("schuler", "uber", "turkei"):
            print(f" '{probe}' candidates: {self.de.candidates(probe)}")

    def test_speed_100_words(self):
        """Measure correction speed for 100 words."""
        en_batch = ["beautful", "teh", "becasue", "freind", "shcool",
                    "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        start = time.time()
        for token in en_batch:
            self.en.correction(token)
        elapsed = time.time() - start
        print(f"\n pyspellchecker: 100 EN corrections in {elapsed*1000:.0f}ms")

        de_batch = ["schuler", "bucher", "turkei", "strasze", "entschuldigung",
                    "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10
        start = time.time()
        for token in de_batch:
            self.de.correction(token)
        elapsed = time.time() - start
        print(f" pyspellchecker: 100 DE corrections in {elapsed*1000:.0f}ms")
class TestSymspellpy:
    """Test symspellpy as a faster alternative."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Skip the whole class gracefully when symspellpy (or its bundled
        # dictionary) is missing; `available` records the outcome.
        try:
            self.sym, self.Verbosity = _load_symspellpy()
            self.available = True
        except (ImportError, FileNotFoundError) as e:
            self.available = False
            pytest.skip(f"symspellpy not installed: {e}")

    def test_correction_quality(self):
        """Test symspellpy corrections (EN only — no DE dict bundled)."""
        # Single-word English cases only. NOTE: the original comprehension
        # filtered on the throwaway `_` binding; use a named `lang` instead.
        en_cases = [
            (ocr, expected, category)
            for ocr, expected, lang, category in OCR_TEST_CASES
            if lang == "en"
            and category not in ("sentence", "sentence_correct", "merged", "a_vs_I")
        ]
        results = []
        for ocr, expected, category in en_cases:
            suggestions = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2)
            if suggestions:
                fix = suggestions[0].term
                # Restore the leading capital lost by the lowercase lookup.
                if ocr[0].isupper():
                    fix = fix[0].upper() + fix[1:]
                result = fix
            else:
                # No suggestion within edit distance 2 — keep the OCR output.
                result = ocr
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            status = "OK" if ok else "MISS"
            print(f" {status}: '{ocr}''{result}' (expected '{expected}') [{category}]")
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_speed_100_words(self):
        """Measure symspellpy correction speed for 100 words."""
        words = ["beautful", "teh", "becasue", "freind", "shcool",
                 "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words:
            self.sym.lookup(w, self.Verbosity.CLOSEST, max_edit_distance=2)
        dt = time.time() - t0
        print(f"\n symspellpy: 100 EN corrections in {dt*1000:.0f}ms")

    def test_compound_segmentation(self):
        """Test symspellpy's word segmentation for merged words."""
        cases = [
            ("atmyschool", "at my school"),
            ("goodidea", "good idea"),
            ("makeadecision", "make a decision"),
        ]
        for merged, expected in cases:
            result = self.sym.word_segmentation(merged)
            ok = result.corrected_string == expected
            status = "OK" if ok else "MISS"
            print(f" {status}: '{merged}''{result.corrected_string}' (expected '{expected}')")
class TestContextDisambiguation:
    """Test context-based disambiguation for a/I and similar cases."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def test_bigram_context(self):
        """Use simple bigram heuristic for a/I disambiguation.

        Approach: check if 'a <next_word>' or 'I <next_word>' is more
        common by checking if <next_word> is a noun (follows 'a') or
        verb (follows 'I').
        """
        # Common words that follow "I" (verbs)
        i_followers = frozenset({
            "am", "was", "have", "had", "do", "did", "will",
            "would", "can", "could", "should", "shall", "may",
            "might", "think", "know", "see", "want", "need",
            "like", "love", "hate", "go", "went", "come",
            "came", "say", "said", "get", "got", "make", "made",
            "take", "took", "give", "gave", "tell", "told",
            "feel", "felt", "find", "found", "believe", "hope",
            "remember", "forget", "understand", "mean", "meant",
            "don't", "didn't", "can't", "won't", "couldn't",
            "shouldn't", "wouldn't", "haven't", "hadn't",
        })
        # Common words that follow "a" (nouns/adjectives)
        a_followers = frozenset({
            "lot", "few", "little", "bit", "good", "bad",
            "big", "small", "great", "new", "old", "long",
            "short", "man", "woman", "boy", "girl", "dog",
            "cat", "book", "car", "house", "day", "year",
            "nice", "beautiful", "large", "huge", "tiny",
        })

        def pick_token(token: str, next_word: str) -> str:
            """Given an ambiguous 'a' or 'I' (or 'l'), pick the right one."""
            follower = next_word.lower()
            if follower in i_followers:
                return "I"
            if follower in a_followers:
                return "a"
            # Neither list matches — leave the token untouched rather than guess.
            return token

        cases = [
            ("l", "am", "I"),
            ("l", "was", "I"),
            ("l", "think", "I"),
            ("a", "book", "a"),
            ("a", "cat", "a"),
            ("a", "lot", "a"),
            ("l", "big", "a"),   # "a big ..."
            ("a", "have", "I"),  # "I have ..."
        ]
        outcomes = []
        for token, next_word, expected in cases:
            picked = pick_token(token, next_word)
            ok = picked == expected
            outcomes.append((token, next_word, expected, picked, ok))
            status = "OK" if ok else "MISS"
            print(f" {status}: '{token} {next_word}...''{picked}' (expected '{expected}')")
        correct = sum(ok for *_, ok in outcomes)
        total = len(outcomes)
        print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)")
class TestLangDetectLibrary:
    """Test py3langid or langdetect if available."""

    def test_py3langid(self):
        # Optional dependency — skip cleanly when absent.
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")
        sentences = [
            ("I go to school every day", "en"),
            ("Ich gehe jeden Tag zur Schule", "de"),
            ("The weather is nice today", "en"),
            ("Das Wetter ist heute schön", "de"),
            ("She likes to play football", "en"),
            ("Er spielt gerne Fußball", "de"),
        ]
        hits = []
        for text, expected in sentences:
            lang, confidence = langid.classify(text)
            ok = lang == expected
            hits.append(ok)
            status = "OK" if ok else "MISS"
            print(f" {status}: '{text[:40]}...'{lang} ({confidence:.2f}) (expected {expected})")
        print(f"\nlangid sentence detection: {sum(hits)}/{len(hits)} correct")

    def test_langid_single_words(self):
        """langid on single words — expected to be unreliable."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")
        words = [("school", "en"), ("Schule", "de"), ("book", "en"),
                 ("Buch", "de"), ("car", "en"), ("Auto", "de"),
                 ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")]
        hits = []
        for word, expected in words:
            lang, conf = langid.classify(word)
            ok = lang == expected
            hits.append(ok)
            status = "OK" if ok else "MISS"
            print(f" {status}: '{word}'{lang} ({conf:.2f}) (expected {expected})")
        print(f"\nlangid single-word: {sum(hits)}/{len(hits)} correct")
class TestIntegratedApproach:
    """Test the combined approach: dict-heuristic for lang + spell correction."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def detect_language(self, word: str) -> str:
        """Dual-dict heuristic language detection."""
        lowered = word.lower()
        # Words of one or two characters are too ambiguous to classify.
        if len(lowered) <= 2:
            return "ambiguous"
        hit_en = bool(self.en.known([lowered]))
        hit_de = bool(self.de.known([lowered]))
        if hit_en and hit_de:
            return "both"
        if hit_en:
            return "en"
        if hit_de:
            return "de"
        return "unknown"

    def correct_word(self, word: str, expected_lang: str) -> str:
        """Correct a single word given the expected language."""
        lowered = word.lower()
        if expected_lang == "en":
            primary, secondary = self.en, self.de
        else:
            primary, secondary = self.de, self.en
        # Already known in the target language — nothing to do.
        if primary.known([lowered]):
            return word
        # Cross-language guard: a word valid in the other language is
        # left untouched rather than "corrected" into the wrong language.
        if secondary.known([lowered]):
            return word
        suggestion = primary.correction(lowered)
        if suggestion and suggestion != lowered:
            # Restore a leading capital from the original token.
            if word[0].isupper():
                suggestion = suggestion[0].upper() + suggestion[1:]
            return suggestion
        return word

    def test_full_pipeline(self):
        """Test: detect language → correct with appropriate dict."""
        vocab_entries = [
            # (english_col, german_col, expected_en, expected_de)
            ("beautful", "schön", "beautiful", "schön"),
            ("school", "Schule", "school", "Schule"),
            ("teh cat", "die Katze", "the cat", "die Katze"),
            ("freind", "Freund", "friend", "Freund"),
            ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"),
            ("Schuler", "Schuler", "Schuler", "Schüler"),  # DE umlaut: Schüler
        ]
        en_correct = 0
        de_correct = 0
        total = len(vocab_entries)
        for en_ocr, de_ocr, exp_en, exp_de in vocab_entries:
            # Correct every token of each column with its language's checker.
            en_fixed = " ".join(self.correct_word(tok, "en") for tok in en_ocr.split())
            de_fixed = " ".join(self.correct_word(tok, "de") for tok in de_ocr.split())
            en_ok = en_fixed == exp_en
            de_ok = de_fixed == exp_de
            en_correct += en_ok
            de_correct += de_ok
            en_status = "OK" if en_ok else "MISS"
            de_status = "OK" if de_ok else "MISS"
            print(f" EN {en_status}: '{en_ocr}''{en_fixed}' (expected '{exp_en}')")
            print(f" DE {de_status}: '{de_ocr}''{de_fixed}' (expected '{exp_de}')")
        print(f"\nEN corrections: {en_correct}/{total} correct")
        print(f"DE corrections: {de_correct}/{total} correct")