Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
SmartSpellChecker (klausur-service): - Language-aware OCR post-correction without LLMs - Dual-dictionary heuristic for EN/DE language detection - Context-based a/I disambiguation via bigram lookup - Multi-digit substitution (sch00l→school) - Cross-language guard (don't false-correct DE words in EN column) - Umlaut correction (Schuler→Schüler, uber→über) - Integrated into spell_review_entries_sync() pipeline - 31 tests, 9ms/100 corrections Vocab-worksheet refactoring (studio-v2): - Split 2337-line page.tsx into 14 files - Custom hook useVocabWorksheet.ts (all state + logic) - 9 components in components/ directory - types.ts, constants.ts for shared definitions Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
495 lines
18 KiB
Python
495 lines
18 KiB
Python
"""
|
|
Benchmark: Spell-checking & language detection approaches for OCR post-correction.
|
|
|
|
Tests pyspellchecker (already used), symspellpy (candidate), and
|
|
dual-dictionary language detection heuristic on real vocabulary OCR data.
|
|
|
|
Run: pytest tests/test_spell_benchmark.py -v -s
|
|
"""
|
|
|
|
import time
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _load_pyspellchecker():
    """Build the (English, German) pyspellchecker pair used by the tests.

    Both checkers are restricted to edit distance 1, matching production use.
    """
    from spellchecker import SpellChecker

    english = SpellChecker(language='en', distance=1)
    german = SpellChecker(language='de', distance=1)
    return english, german
|
|
|
|
|
|
def _load_symspellpy():
    """Load symspellpy with its bundled English frequency dictionary.

    Returns:
        tuple: (configured ``SymSpell`` instance, ``Verbosity`` enum) so
        callers can do ``sym.lookup(word, Verbosity.CLOSEST, ...)``.

    Raises:
        ImportError: if symspellpy is not installed.
        FileNotFoundError: if the bundled dictionary file is missing.
    """
    from symspellpy import SymSpell, Verbosity

    sym = SymSpell(max_dictionary_edit_distance=2)
    # Locate the EN frequency dict shipped inside the symspellpy package.
    # importlib.resources replaces the deprecated pkg_resources API
    # (pkg_resources emits a DeprecationWarning and is slated for removal
    # from setuptools).
    from importlib.resources import files

    dict_path = str(files("symspellpy") / "frequency_dictionary_en_82_765.txt")
    sym.load_dictionary(dict_path, term_index=0, count_index=1)
    return sym, Verbosity
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test data: (ocr_output, expected_correction, language, category)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Each case is (ocr_output, expected_correction, language, category).
OCR_TEST_CASES = [
    # Single-char ambiguity (OCR confuses l / I / a)
    ("l am a student", "I am a student", "en", "a_vs_I"),
    ("a book", "a book", "en", "a_vs_I"),                # should NOT change
    ("I like cats", "I like cats", "en", "a_vs_I"),      # should NOT change
    ("lt is raining", "It is raining", "en", "a_vs_I"),  # l→I at start

    # Digit-letter confusion
    ("g0od", "good", "en", "digit_letter"),
    ("sch00l", "school", "en", "digit_letter"),
    ("he1lo", "hello", "en", "digit_letter"),
    ("Sch0n", "Schon", "de", "digit_letter"),  # German

    # Dropped umlauts
    ("schon", "schön", "de", "umlaut"),  # tricky: "schon" is also valid DE!
    ("Schuler", "Schüler", "de", "umlaut"),
    ("uber", "über", "de", "umlaut"),
    ("Bucher", "Bücher", "de", "umlaut"),
    ("Turkei", "Türkei", "de", "umlaut"),

    # Common OCR errors
    ("beautful", "beautiful", "en", "missing_char"),
    ("teh", "the", "en", "transposition"),
    ("becasue", "because", "en", "transposition"),
    ("freind", "friend", "en", "swap"),
    ("Freund", "Freund", "de", "correct"),  # already correct

    # Merged words
    ("atmyschool", "at my school", "en", "merged"),
    ("goodidea", "good idea", "en", "merged"),

    # Mixed language example sentences
    ("I go to teh school", "I go to the school", "en", "sentence"),
    ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"),
]
|
|
|
|
# Language detection test: (word, expected_language)
|
|
# Language detection cases: (word, expected_language).
# "both" means the word exists in both the EN and DE dictionaries.
LANG_DETECT_CASES = [
    # Unambiguously English
    ("school", "en"),
    ("beautiful", "en"),
    ("homework", "en"),
    ("yesterday", "en"),
    ("children", "en"),
    ("because", "en"),
    ("environment", "en"),
    ("although", "en"),

    # Unambiguously German
    ("Schule", "de"),
    ("Hausaufgaben", "de"),
    ("Freundschaft", "de"),
    ("Umwelt", "de"),
    ("Kindergarten", "de"),  # also used in English!
    ("Bücher", "de"),
    ("Straße", "de"),
    ("Entschuldigung", "de"),

    # Ambiguous (exist in both)
    ("Hand", "both"),
    ("Finger", "both"),
    ("Arm", "both"),
    ("Name", "both"),
    ("Ball", "both"),

    # Short / tricky function words
    ("a", "en"),
    ("I", "en"),
    ("in", "both"),
    ("an", "both"),
    ("the", "en"),
    ("die", "de"),
    ("der", "de"),
    ("to", "en"),
    ("zu", "de"),
]
|
|
|
|
|
|
# ===========================================================================
|
|
# Tests
|
|
# ===========================================================================
|
|
|
|
|
|
class TestPyspellchecker:
    """Test pyspellchecker capabilities for OCR correction.

    The autouse ``setup`` fixture gives every test ``self.en`` and
    ``self.de`` — EN/DE SpellChecker instances at edit distance 1.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        # Fresh EN/DE dictionaries for each test method.
        self.en, self.de = _load_pyspellchecker()

    def test_known_words(self):
        """Verify basic dictionary lookup."""
        assert self.en.known(["school"])
        assert self.en.known(["beautiful"])
        assert self.de.known(["schule"])  # lowercase
        assert self.de.known(["freund"])
        # Not known
        assert not self.en.known(["xyzqwk"])
        assert not self.de.known(["xyzqwk"])

    def test_correction_quality(self):
        """Test correction suggestions for OCR errors.

        Runs single-word categories from OCR_TEST_CASES through
        ``correction()`` word-by-word and prints a hit/miss summary.
        """
        results = []
        for ocr, expected, lang, category in OCR_TEST_CASES:
            if category in ("sentence", "sentence_correct", "merged", "a_vs_I"):
                continue  # skip multi-word cases

            spell = self.en if lang == "en" else self.de
            words = ocr.split()
            corrected = []
            for w in words:
                # Known words pass through unchanged (lookup is lowercase).
                if spell.known([w.lower()]):
                    corrected.append(w)
                else:
                    fix = spell.correction(w.lower())
                    if fix and fix != w.lower():
                        # Preserve case
                        if w[0].isupper():
                            fix = fix[0].upper() + fix[1:]
                        corrected.append(fix)
                    else:
                        corrected.append(w)
            result = " ".join(corrected)
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            if not ok:
                print(f" MISS: '{ocr}' → '{result}' (expected '{expected}') [{category}]")
            else:
                print(f" OK: '{ocr}' → '{result}' [{category}]")

        # results rows are (ocr, expected, result, ok, category).
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_language_detection_heuristic(self):
        """Test dual-dictionary language detection.

        Detection rule: membership in EN dict only → "en", DE only →
        "de", both → "both", neither → "unknown".
        """
        results = []
        for word, expected_lang in LANG_DETECT_CASES:
            w = word.lower()
            in_en = bool(self.en.known([w]))
            in_de = bool(self.de.known([w]))

            if in_en and in_de:
                detected = "both"
            elif in_en:
                detected = "en"
            elif in_de:
                detected = "de"
            else:
                detected = "unknown"

            ok = detected == expected_lang
            results.append((word, expected_lang, detected, ok))
            if not ok:
                print(f" MISS: '{word}' → {detected} (expected {expected_lang})")
            else:
                print(f" OK: '{word}' → {detected}")

        correct = sum(1 for *_, ok in results if ok)
        total = len(results)
        print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_umlaut_awareness(self):
        """Test if pyspellchecker suggests umlaut corrections.

        Informational only — prints candidate sets, asserts nothing.
        """
        # "Schuler" should suggest "Schüler"
        candidates = self.de.candidates("schuler")
        print(f" 'schuler' candidates: {candidates}")
        # "uber" should suggest "über"
        candidates_uber = self.de.candidates("uber")
        print(f" 'uber' candidates: {candidates_uber}")
        # "Turkei" should suggest "Türkei"
        candidates_turkei = self.de.candidates("turkei")
        print(f" 'turkei' candidates: {candidates_turkei}")

    def test_speed_100_words(self):
        """Measure correction speed for 100 words (10 misspellings x 10)."""
        words_en = ["beautful", "teh", "becasue", "freind", "shcool",
                    "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words_en:
            self.en.correction(w)
        dt = time.time() - t0
        print(f"\n pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms")

        words_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung",
                    "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10
        t0 = time.time()
        for w in words_de:
            self.de.correction(w)
        dt = time.time() - t0
        print(f" pyspellchecker: 100 DE corrections in {dt*1000:.0f}ms")
|
|
|
|
|
|
class TestSymspellpy:
    """Test symspellpy as a faster alternative."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Skip the whole class gracefully when symspellpy (or its
        # bundled dictionary file) is not present in the environment.
        try:
            self.sym, self.Verbosity = _load_symspellpy()
        except (ImportError, FileNotFoundError) as e:
            self.available = False
            pytest.skip(f"symspellpy not installed: {e}")
        self.available = True

    def test_correction_quality(self):
        """Test symspellpy corrections (EN only — no DE dict bundled)."""
        skipped = ("sentence", "sentence_correct", "merged", "a_vs_I")
        en_cases = [
            (ocr, expected, cat)
            for ocr, expected, lang, cat in OCR_TEST_CASES
            if lang == "en" and cat not in skipped
        ]

        results = []
        for ocr, expected, category in en_cases:
            hits = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2)
            if not hits:
                result = ocr
            else:
                best = hits[0].term
                # Restore leading capitalization lost by the lowercase lookup.
                result = best[0].upper() + best[1:] if ocr[0].isupper() else best

            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            status = "OK" if ok else "MISS"
            print(f" {status}: '{ocr}' → '{result}' (expected '{expected}') [{category}]")

        correct = sum(1 for row in results if row[3])
        total = len(results)
        print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_speed_100_words(self):
        """Measure symspellpy correction speed for 100 words."""
        misspellings = ["beautful", "teh", "becasue", "freind", "shcool",
                        "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        started = time.time()
        for word in misspellings:
            self.sym.lookup(word, self.Verbosity.CLOSEST, max_edit_distance=2)
        elapsed = time.time() - started
        print(f"\n symspellpy: 100 EN corrections in {elapsed*1000:.0f}ms")

    def test_compound_segmentation(self):
        """Test symspellpy's word segmentation for merged words."""
        cases = [
            ("atmyschool", "at my school"),
            ("goodidea", "good idea"),
            ("makeadecision", "make a decision"),
        ]
        for merged, expected in cases:
            segmented = self.sym.word_segmentation(merged)
            status = "OK" if segmented.corrected_string == expected else "MISS"
            print(f" {status}: '{merged}' → '{segmented.corrected_string}' (expected '{expected}')")
|
|
|
|
|
|
class TestContextDisambiguation:
    """Test context-based disambiguation for a/I and similar cases."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def test_bigram_context(self):
        """Bigram heuristic for a/I disambiguation.

        Decide between 'a' and 'I' (or an OCR'd 'l') by classifying the
        following word: verb-like words typically follow 'I', noun- or
        adjective-like words typically follow 'a'.
        """
        # Words that commonly follow "I" (verbs / auxiliaries).
        verb_like = {"am", "was", "have", "had", "do", "did", "will",
                     "would", "can", "could", "should", "shall", "may",
                     "might", "think", "know", "see", "want", "need",
                     "like", "love", "hate", "go", "went", "come",
                     "came", "say", "said", "get", "got", "make", "made",
                     "take", "took", "give", "gave", "tell", "told",
                     "feel", "felt", "find", "found", "believe", "hope",
                     "remember", "forget", "understand", "mean", "meant",
                     "don't", "didn't", "can't", "won't", "couldn't",
                     "shouldn't", "wouldn't", "haven't", "hadn't"}

        # Words that commonly follow "a" (nouns / adjectives).
        noun_like = {"lot", "few", "little", "bit", "good", "bad",
                     "big", "small", "great", "new", "old", "long",
                     "short", "man", "woman", "boy", "girl", "dog",
                     "cat", "book", "car", "house", "day", "year",
                     "nice", "beautiful", "large", "huge", "tiny"}

        def pick_token(token: str, next_word: str) -> str:
            """Resolve an ambiguous 'a'/'I'/'l' from the following word."""
            follower = next_word.lower()
            if follower in verb_like:
                return "I"
            if follower in noun_like:
                return "a"
            # Uncertain: leave the token unchanged rather than guess.
            return token

        cases = [
            ("l", "am", "I"),
            ("l", "was", "I"),
            ("l", "think", "I"),
            ("a", "book", "a"),
            ("a", "cat", "a"),
            ("a", "lot", "a"),
            ("l", "big", "a"),   # "a big ..."
            ("a", "have", "I"),  # "I have ..."
        ]

        results = []
        for token, next_word, expected in cases:
            resolved = pick_token(token, next_word)
            ok = resolved == expected
            results.append((token, next_word, expected, resolved, ok))
            status = "OK" if ok else "MISS"
            print(f" {status}: '{token} {next_word}...' → '{resolved}' (expected '{expected}')")

        correct = sum(1 for row in results if row[-1])
        total = len(results)
        print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)")
|
|
|
|
|
|
class TestLangDetectLibrary:
    """Test py3langid or langdetect if available."""

    def test_py3langid(self):
        """langid on full sentences — its intended use case."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")

        sentences = [
            ("I go to school every day", "en"),
            ("Ich gehe jeden Tag zur Schule", "de"),
            ("The weather is nice today", "en"),
            ("Das Wetter ist heute schön", "de"),
            ("She likes to play football", "en"),
            ("Er spielt gerne Fußball", "de"),
        ]

        outcomes = []
        for text, expected in sentences:
            detected, confidence = langid.classify(text)
            matched = detected == expected
            outcomes.append(matched)
            status = "OK" if matched else "MISS"
            print(f" {status}: '{text[:40]}...' → {detected} ({confidence:.2f}) (expected {expected})")

        print(f"\nlangid sentence detection: {sum(outcomes)}/{len(outcomes)} correct")

    def test_langid_single_words(self):
        """langid on single words — expected to be unreliable."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")

        words = [("school", "en"), ("Schule", "de"), ("book", "en"),
                 ("Buch", "de"), ("car", "en"), ("Auto", "de"),
                 ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")]

        outcomes = []
        for word, expected in words:
            detected, confidence = langid.classify(word)
            matched = detected == expected
            outcomes.append(matched)
            status = "OK" if matched else "MISS"
            print(f" {status}: '{word}' → {detected} ({confidence:.2f}) (expected {expected})")

        print(f"\nlangid single-word: {sum(outcomes)}/{len(outcomes)} correct")
|
|
|
|
|
|
class TestIntegratedApproach:
    """Test the combined approach: dict-heuristic for lang + spell correction."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def detect_language(self, word: str) -> str:
        """Dual-dict heuristic language detection.

        Returns one of "en", "de", "both", "unknown", or "ambiguous"
        (for words of two characters or fewer).
        """
        lowered = word.lower()
        # Very short words are too ambiguous to classify.
        if len(lowered) <= 2:
            return "ambiguous"
        known_en = bool(self.en.known([lowered]))
        known_de = bool(self.de.known([lowered]))
        if known_en:
            return "both" if known_de else "en"
        return "de" if known_de else "unknown"

    def correct_word(self, word: str, expected_lang: str) -> str:
        """Correct a single word given the expected language.

        Words valid in either language pass through untouched
        (cross-language guard); otherwise the expected-language
        dictionary proposes a correction, with the original leading
        capitalization preserved.
        """
        lowered = word.lower()
        if expected_lang == "en":
            primary, secondary = self.en, self.de
        else:
            primary, secondary = self.de, self.en

        # Already valid in the target language, or valid in the other
        # language — leave it alone.
        if primary.known([lowered]) or secondary.known([lowered]):
            return word

        suggestion = primary.correction(lowered)
        if suggestion and suggestion != lowered:
            if word[0].isupper():
                suggestion = suggestion[0].upper() + suggestion[1:]
            return suggestion

        return word

    def test_full_pipeline(self):
        """Test: detect language → correct with appropriate dict."""
        vocab_entries = [
            # (english_col, german_col, expected_en, expected_de)
            ("beautful", "schön", "beautiful", "schön"),
            ("school", "Schule", "school", "Schule"),
            ("teh cat", "die Katze", "the cat", "die Katze"),
            ("freind", "Freund", "friend", "Freund"),
            ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"),
            ("Schuler", "Schuler", "Schuler", "Schüler"),  # DE umlaut: Schüler
        ]

        en_correct = 0
        de_correct = 0
        total = len(vocab_entries)

        for en_ocr, de_ocr, exp_en, exp_de in vocab_entries:
            # Correct each whitespace-separated word in its column.
            en_fixed = " ".join(self.correct_word(w, "en") for w in en_ocr.split())
            de_fixed = " ".join(self.correct_word(w, "de") for w in de_ocr.split())

            en_ok = en_fixed == exp_en
            de_ok = de_fixed == exp_de
            en_correct += en_ok
            de_correct += de_ok

            en_status = "OK" if en_ok else "MISS"
            de_status = "OK" if de_ok else "MISS"
            print(f" EN {en_status}: '{en_ocr}' → '{en_fixed}' (expected '{exp_en}')")
            print(f" DE {de_status}: '{de_ocr}' → '{de_fixed}' (expected '{exp_de}')")

        print(f"\nEN corrections: {en_correct}/{total} correct")
        print(f"DE corrections: {de_correct}/{total} correct")
|