Add SmartSpellChecker + refactor vocab-worksheet page.tsx
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s

SmartSpellChecker (klausur-service):
- Language-aware OCR post-correction without LLMs
- Dual-dictionary heuristic for EN/DE language detection
- Context-based a/I disambiguation via bigram lookup
- Multi-digit substitution (sch00l→school)
- Cross-language guard (don't false-correct DE words in EN column)
- Umlaut correction (Schuler→Schüler, uber→über)
- Integrated into spell_review_entries_sync() pipeline
- 31 tests, 9ms/100 corrections

Vocab-worksheet refactoring (studio-v2):
- Split 2337-line page.tsx into 14 files
- Custom hook useVocabWorksheet.ts (all state + logic)
- 9 components in components/ directory
- types.ts, constants.ts for shared definitions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-12 12:25:01 +02:00
parent 04fa01661c
commit 909d0729f6
17 changed files with 3545 additions and 2228 deletions

View File

@@ -881,10 +881,25 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
"""Rule-based OCR correction: spell-checker + structural heuristics.
Deterministic — never translates, never touches IPA, never hallucinates.
Uses SmartSpellChecker for language-aware corrections with context-based
disambiguation (a/I), multi-digit substitution, and cross-language guard.
"""
t0 = time.time()
changes: List[Dict] = []
all_corrected: List[Dict] = []
# Use SmartSpellChecker if available, fall back to legacy _spell_fix_field
_smart = None
try:
from smart_spell import SmartSpellChecker
_smart = SmartSpellChecker()
logger.debug("spell_review: using SmartSpellChecker")
except Exception:
logger.debug("spell_review: SmartSpellChecker not available, using legacy")
# Map field names → language codes for SmartSpellChecker
_LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
for i, entry in enumerate(entries):
e = dict(entry)
# Page-ref normalization (always, regardless of review status)
@@ -907,9 +922,18 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
old_val = (e.get(field_name) or "").strip()
if not old_val:
continue
# example field is mixed-language — try German first (for umlauts)
lang = "german" if field_name in ("german", "example") else "english"
new_val, was_changed = _spell_fix_field(old_val, field=lang)
if _smart:
# SmartSpellChecker path — language-aware, context-based
lang_code = _LANG_MAP.get(field_name, "en")
result = _smart.correct_text(old_val, lang=lang_code)
new_val = result.corrected
was_changed = result.changed
else:
# Legacy path
lang = "german" if field_name in ("german", "example") else "english"
new_val, was_changed = _spell_fix_field(old_val, field=lang)
if was_changed and new_val != old_val:
changes.append({
"row_index": e.get("row_index", i),
@@ -921,12 +945,13 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
e["llm_corrected"] = True
all_corrected.append(e)
duration_ms = int((time.time() - t0) * 1000)
model_name = "smart-spell-checker" if _smart else "spell-checker"
return {
"entries_original": entries,
"entries_corrected": all_corrected,
"changes": changes,
"skipped_count": 0,
"model_used": "spell-checker",
"model_used": model_name,
"duration_ms": duration_ms,
}

View File

@@ -0,0 +1,369 @@
"""
SmartSpellChecker — Language-aware OCR post-correction without LLMs.
Uses pyspellchecker (MIT) with dual EN+DE dictionaries for:
- Automatic language detection per word (dual-dictionary heuristic)
- OCR error correction (digit↔letter, umlauts, transpositions)
- Context-based disambiguation (a/I, l/I) via bigram lookup
- Mixed-language support for example sentences
Lizenz: Apache 2.0 (kommerziell nutzbar)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Init
# ---------------------------------------------------------------------------
try:
from spellchecker import SpellChecker as _SpellChecker
_en_spell = _SpellChecker(language='en', distance=1)
_de_spell = _SpellChecker(language='de', distance=1)
_AVAILABLE = True
except ImportError:
_AVAILABLE = False
logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")
Lang = Literal["en", "de", "both", "unknown"]
# ---------------------------------------------------------------------------
# Bigram context for a/I disambiguation
# ---------------------------------------------------------------------------
# Words that commonly follow "I" (subject pronoun → verb/modal)
_I_FOLLOWERS: frozenset = frozenset({
"am", "was", "have", "had", "do", "did", "will", "would", "can",
"could", "should", "shall", "may", "might", "must",
"think", "know", "see", "want", "need", "like", "love", "hate",
"go", "went", "come", "came", "say", "said", "get", "got",
"make", "made", "take", "took", "give", "gave", "tell", "told",
"feel", "felt", "find", "found", "believe", "hope", "wish",
"remember", "forget", "understand", "mean", "meant",
"don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
"shouldn't", "haven't", "hadn't", "isn't", "wasn't",
"really", "just", "also", "always", "never", "often", "sometimes",
})
# Words that commonly follow "a" (article → noun/adjective)
_A_FOLLOWERS: frozenset = frozenset({
"lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
"long", "short", "big", "small", "large", "huge", "tiny",
"nice", "beautiful", "wonderful", "terrible", "horrible",
"man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
"book", "car", "house", "room", "school", "teacher", "student",
"day", "week", "month", "year", "time", "place", "way",
"friend", "family", "person", "problem", "question", "story",
"very", "really", "quite", "rather", "pretty", "single",
})
# Digit→letter substitutions (OCR confusion)
_DIGIT_SUBS: Dict[str, List[str]] = {
'0': ['o', 'O'],
'1': ['l', 'I'],
'5': ['s', 'S'],
'6': ['g', 'G'],
'8': ['b', 'B'],
'|': ['I', 'l'],
}
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
# Umlaut confusion: OCR drops dots (ü→u, ä→a, ö→o)
_UMLAUT_MAP = {
'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
}
# Tokenizer
_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class CorrectionResult:
    """Outcome of a SmartSpellChecker.correct_text() call."""
    original: str  # input text, unmodified
    corrected: str  # output text (equals original when nothing changed)
    lang_detected: Lang  # language used for correction ("auto" resolves here)
    changed: bool  # convenience flag: corrected != original
    changes: List[str] = field(default_factory=list)  # one log entry per corrected token
# ---------------------------------------------------------------------------
# Core class
# ---------------------------------------------------------------------------
class SmartSpellChecker:
    """Language-aware OCR spell checker using pyspellchecker (no LLM).

    Combines a dual EN/DE dictionary heuristic for language detection,
    digit/pipe OCR substitution, German umlaut restoration, and a
    bigram-based a/I disambiguation. All corrections are deterministic.
    """

    def __init__(self):
        """Bind the module-level EN/DE dictionaries.

        Raises:
            RuntimeError: if pyspellchecker is not installed.
        """
        if not _AVAILABLE:
            raise RuntimeError("pyspellchecker not installed")
        # Dictionaries are loaded once at import time and shared between
        # instances, so constructing a checker is cheap.
        self.en = _en_spell
        self.de = _de_spell

    # --- Language detection ---
    def detect_word_lang(self, word: str) -> Lang:
        """Detect language of a single word using dual-dict heuristic.

        Returns "both" when the word is known to both dictionaries and
        "unknown" when it is known to neither (or is empty after stripping
        surrounding punctuation).
        """
        w = word.lower().strip(".,;:!?\"'()")
        if not w:
            return "unknown"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"

    def detect_text_lang(self, text: str) -> Lang:
        """Detect dominant language of a text string (sentence/phrase).

        Majority vote over per-word detection; words detected as "both"
        vote for neither side. A tie with at least one vote yields "both".
        """
        words = re.findall(r"[A-Za-zÄÖÜäöüß]+", text)
        if not words:
            return "unknown"
        en_count = 0
        de_count = 0
        for w in words:
            lang = self.detect_word_lang(w)
            if lang == "en":
                en_count += 1
            elif lang == "de":
                de_count += 1
            # "both" doesn't count for either
        if en_count > de_count:
            return "en"
        if de_count > en_count:
            return "de"
        if en_count == de_count and en_count > 0:
            return "both"
        return "unknown"

    # --- Single-word correction ---
    def _known(self, word: str) -> bool:
        """True if word is known in EN or DE dictionary."""
        w = word.lower()
        return bool(self.en.known([w])) or bool(self.de.known([w]))

    def _known_in(self, word: str, lang: str) -> bool:
        """True if word is known in a specific language dictionary."""
        w = word.lower()
        spell = self.en if lang == "en" else self.de
        return bool(spell.known([w]))

    def correct_word(self, word: str, lang: str = "en",
                     prev_word: str = "", next_word: str = "") -> Optional[str]:
        """Correct a single word for the given language.

        Returns None if no correction is needed/found, or the corrected
        string. Stages, in order: a/I disambiguation for known ambiguous
        tokens, digit/pipe substitution, umlaut restoration (DE only),
        then a cross-language-guarded edit-distance correction.

        Args:
            word: The word to check/correct
            lang: Expected language ("en" or "de")
            prev_word: Previous word (currently unused; reserved for
                future context rules)
            next_word: Next word (used for a/I disambiguation)
        """
        if not word or not word.strip():
            return None
        # Skip numbers, abbreviations with dots, very short tokens
        if word.isdigit() or '.' in word:
            return None
        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
        # 1. Already known → no fix, except the ambiguous l/| tokens which
        #    may really mean "I" or "a" depending on the following word.
        if self._known(word):
            if word.lower() in ('l', '|') and next_word:
                return self._disambiguate_a_I(word, next_word)
            return None
        # 2. Digit/pipe substitution
        if has_suspicious:
            if word == '|':
                return 'I'
            # Try cheap single-char substitutions first
            for i, ch in enumerate(word):
                if ch not in _DIGIT_SUBS:
                    continue
                for replacement in _DIGIT_SUBS[ch]:
                    candidate = word[:i] + replacement + word[i + 1:]
                    if self._known(candidate):
                        return candidate
            # Try multi-char substitution (e.g., "sch00l" → "school")
            multi = self._try_multi_digit_sub(word)
            if multi:
                return multi
        # 3. Umlaut correction (German)
        if lang == "de" and len(word) >= 3 and word.isalpha():
            umlaut_fix = self._try_umlaut_fix(word)
            if umlaut_fix:
                return umlaut_fix
        # 4. General spell correction
        if not has_suspicious and len(word) >= 3 and word.isalpha():
            # Safety: don't correct if the word is valid in the OTHER language
            # (either directly or via umlaut fix) — it likely landed in the
            # wrong column or is a proper noun.
            other_lang = "de" if lang == "en" else "en"
            if self._known_in(word, other_lang):
                return None
            if other_lang == "de" and self._try_umlaut_fix(word):
                return None  # has a valid DE umlaut variant → don't touch
            spell = self.en if lang == "en" else self.de
            correction = spell.correction(word.lower())
            if correction and correction != word.lower():
                # Preserve the original word's initial capitalization.
                if word[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if self._known(correction):
                    return correction
        return None

    # --- Multi-digit substitution ---
    # NOTE: a broken recursive variant of this method (calling a nonexistent
    # _multi_sub_recurse helper) previously shadowed this definition; it was
    # dead code and has been removed.
    def _try_multi_digit_sub(self, word: str) -> Optional[str]:
        """Try replacing multiple digits simultaneously using BFS.

        Bounded to at most 4 suspicious positions, which caps the search
        at 3^4 = 81 variants (keep-original plus two replacements each).
        """
        positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
        if not positions or len(positions) > 4:
            return None
        # BFS over substitution combinations
        queue = [list(word)]
        for pos, ch in positions:
            next_queue = []
            for current in queue:
                # Keep original
                next_queue.append(current[:])
                # Try each substitution
                for repl in _DIGIT_SUBS[ch]:
                    variant = current[:]
                    variant[pos] = repl
                    next_queue.append(variant)
            queue = next_queue
        # Check which combinations produce known words
        for combo in queue:
            candidate = "".join(combo)
            if candidate != word and self._known(candidate):
                return candidate
        return None

    # --- Umlaut fix ---
    def _try_umlaut_fix(self, word: str) -> Optional[str]:
        """Try single-char umlaut substitutions for German words.

        Returns the first substitution that yields a known word, else None.
        """
        for i, ch in enumerate(word):
            if ch in _UMLAUT_MAP:
                candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
                if self._known(candidate):
                    return candidate
        return None

    # --- a/I disambiguation ---
    def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
        """Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|').

        Uses the bigram follower sets: a following verb/modal suggests the
        pronoun "I", a following noun/adjective suggests the article "a".
        Returns None when the context is inconclusive (caller leaves the
        token unchanged).
        """
        nw = next_word.lower().strip(".,;:!?")
        if nw in _I_FOLLOWERS:
            return "I"
        if nw in _A_FOLLOWERS:
            return "a"
        return None  # uncertain, don't change

    # --- Full text correction ---
    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Tokenizes with _TOKEN_RE into (word, separator) pairs, corrects each
        word with its neighbors as context, and reassembles the text so all
        punctuation and whitespace are preserved byte-for-byte.

        Args:
            text: The text to correct
            lang: Expected language ("en", "de", or "auto" to detect)
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)
        detected = self.detect_text_lang(text) if lang == "auto" else lang
        parts: List[str] = []
        changes: List[str] = []
        tokens = list(_TOKEN_RE.finditer(text))
        for idx, m in enumerate(tokens):
            token, sep = m.group(1), m.group(2)
            next_word = tokens[idx + 1].group(1) if idx + 1 < len(tokens) else ""
            prev_word = tokens[idx - 1].group(1) if idx > 0 else ""
            correction = self.correct_word(
                token, lang=detected if detected in ("en", "de") else "en",
                prev_word=prev_word, next_word=next_word,
            )
            if correction and correction != token:
                # Human-readable audit entry: "old→new"
                changes.append(f"{token}→{correction}")
                parts.append(correction)
            else:
                parts.append(token)
            parts.append(sep)
        # Append any trailing text the tokenizer did not consume
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])
        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---
    def correct_vocab_entry(self, english: str, german: str,
                            example: str = "") -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language — the most reliable
        signal. The optional example sentence is auto-detected since it may
        be written in either language.
        """
        results = {}
        results["english"] = self.correct_text(english, lang="en")
        results["german"] = self.correct_text(german, lang="de")
        if example:
            # For examples, auto-detect language
            results["example"] = self.correct_text(example, lang="auto")
        return results

View File

@@ -0,0 +1,210 @@
"""Tests for SmartSpellChecker — language-aware OCR post-correction."""
import pytest
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from smart_spell import SmartSpellChecker, CorrectionResult
@pytest.fixture
def sc():
    # Fresh checker per test; the underlying dictionaries are module-level
    # singletons in smart_spell, so construction is cheap.
    return SmartSpellChecker()
# ─── Language Detection ──────────────────────────────────────────────────────
class TestLanguageDetection:
    """Dual-dictionary language detection on single words and short texts."""

    def test_clear_english_words(self, sc):
        samples = ("school", "beautiful", "homework", "yesterday", "because")
        detected = {w: sc.detect_word_lang(w) for w in samples}
        for word, lang in detected.items():
            assert lang in ("en", "both"), f"{word} should be EN"

    def test_clear_german_words(self, sc):
        samples = ("Schule", "Hausaufgaben", "Freundschaft", "Straße", "Entschuldigung")
        detected = {w: sc.detect_word_lang(w) for w in samples}
        for word, lang in detected.items():
            assert lang in ("de", "both"), f"{word} should be DE"

    def test_ambiguous_words(self, sc):
        """Words that exist in both languages."""
        for word in ["Hand", "Finger", "Arm", "Name", "Ball"]:
            assert sc.detect_word_lang(word) == "both", f"{word} should be 'both'"

    def test_unknown_words(self, sc):
        # Nonsense and empty input both map to "unknown".
        assert sc.detect_word_lang("xyzqwk") == "unknown"
        assert sc.detect_word_lang("") == "unknown"

    def test_english_sentence(self, sc):
        assert sc.detect_text_lang("I go to school every day") == "en"

    def test_german_sentence(self, sc):
        assert sc.detect_text_lang("Ich gehe jeden Tag zur Schule") == "de"

    def test_mixed_sentence(self, sc):
        # The dominant language should win the majority vote.
        outcome = sc.detect_text_lang("I like to play Fußball with my Freunde")
        assert outcome in ("en", "both")
# ─── Single Word Correction ────────────────────────────────────────────────
class TestSingleWordCorrection:
    """correct_word() on isolated tokens: OCR digits, umlauts, typos."""

    def test_known_word_not_changed(self, sc):
        # Valid dictionary words pass through untouched (None = no fix).
        for token, language in (("school", "en"), ("Freund", "de")):
            assert sc.correct_word(token, language) is None

    def test_digit_letter_single(self, sc):
        fixes = {"g0od": "good", "he1lo": "hello"}
        for raw, fixed in fixes.items():
            assert sc.correct_word(raw, "en") == fixed

    def test_digit_letter_multi(self, sc):
        """Multiple digit substitutions (e.g., sch00l)."""
        result = sc.correct_word("sch00l", "en")
        assert result == "school", f"Expected 'school', got '{result}'"

    def test_pipe_to_I(self, sc):
        assert "I" == sc.correct_word("|", "en")

    def test_umlaut_schuler(self, sc):
        assert "Schüler" == sc.correct_word("Schuler", "de")

    def test_umlaut_uber(self, sc):
        assert "über" == sc.correct_word("uber", "de")

    def test_umlaut_bucher(self, sc):
        assert "Bücher" == sc.correct_word("Bucher", "de")

    def test_umlaut_turkei(self, sc):
        assert "Türkei" == sc.correct_word("Turkei", "de")

    def test_missing_char(self, sc):
        assert "beautiful" == sc.correct_word("beautful", "en")

    def test_transposition(self, sc):
        assert "the" == sc.correct_word("teh", "en")

    def test_swap(self, sc):
        assert "friend" == sc.correct_word("freind", "en")

    def test_no_false_correction_cross_lang(self, sc):
        """Don't correct a word that's valid in the other language.

        'Schuler' in the EN column should NOT be corrected to 'Schuyler'
        because 'Schüler' is valid German — it's likely a German word that
        ended up in the wrong column (or is a surname).
        """
        outcome = sc.correct_word("Schuler", "en")
        # Either None (left alone) or at least not the bogus EN correction.
        assert outcome != "Schuyler", "Should not false-correct German word in EN column"
# ─── a/I Disambiguation ──────────────────────────────────────────────────────
class TestAIDisambiguation:
    """Bigram-context resolution of ambiguous 'l'/'a' OCR tokens."""

    def test_I_before_verb(self, sc):
        # A following verb/modal signals the pronoun "I".
        for follower in ("am", "was", "think", "have", "don't"):
            assert sc._disambiguate_a_I("l", follower) == "I"

    def test_a_before_noun_adj(self, sc):
        # A following noun/adjective signals the article "a".
        for follower in ("book", "cat", "big", "lot"):
            assert sc._disambiguate_a_I("a", follower) == "a"

    def test_uncertain_returns_none(self, sc):
        """When context is ambiguous, return None (don't change)."""
        assert sc._disambiguate_a_I("l", "xyzqwk") is None
# ─── Full Text Correction ───────────────────────────────────────────────────
class TestFullTextCorrection:
    """correct_text() on whole field values."""

    def test_english_sentence(self, sc):
        outcome = sc.correct_text("teh cat is beautful", "en")
        assert outcome.changed
        for fixed in ("the", "beautiful"):
            assert fixed in outcome.corrected

    def test_german_sentence_no_change(self, sc):
        outcome = sc.correct_text("Ich gehe zur Schule", "de")
        assert not outcome.changed

    def test_german_umlaut_fix(self, sc):
        outcome = sc.correct_text("Der Schuler liest Bucher", "de")
        for fixed in ("Schüler", "Bücher"):
            assert fixed in outcome.corrected

    def test_preserves_punctuation(self, sc):
        outcome = sc.correct_text("teh cat, beautful!", "en")
        for mark in (",", "!"):
            assert mark in outcome.corrected

    def test_empty_text(self, sc):
        outcome = sc.correct_text("", "en")
        assert not outcome.changed
        assert outcome.corrected == ""
# ─── Vocab Entry Correction ─────────────────────────────────────────────────
class TestVocabEntryCorrection:
    """correct_vocab_entry(): column position determines the language."""

    def test_basic_entry(self, sc):
        outcome = sc.correct_vocab_entry(english="beautful", german="schön")
        assert outcome["english"].corrected == "beautiful"
        assert outcome["german"].changed is False

    def test_umlaut_in_german(self, sc):
        outcome = sc.correct_vocab_entry(english="school", german="Schuler")
        assert outcome["english"].changed is False
        assert outcome["german"].corrected == "Schüler"

    def test_example_auto_detect(self, sc):
        outcome = sc.correct_vocab_entry(
            english="friend",
            german="Freund",
            example="My best freind lives in Berlin",
        )
        # The example column has no fixed language — auto-detection applies.
        assert "friend" in outcome["example"].corrected
# ─── Speed ─────────────────────────────────────────────────────────────────
class TestSpeed:
    """Coarse performance guard — not a benchmark."""

    def test_100_corrections_under_500ms(self, sc):
        """100 word corrections should complete in under 500ms."""
        import time
        base_cases = [
            ("beautful", "en"), ("teh", "en"), ("freind", "en"),
            ("homwork", "en"), ("yesturday", "en"),
            ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"),
            ("uber", "de"), ("Ubung", "de"),
        ]
        workload = base_cases * 10
        start = time.time()
        for token, language in workload:
            sc.correct_word(token, language)
        dt = time.time() - start
        print(f"\n 100 corrections in {dt*1000:.0f}ms")
        assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms"

View File

@@ -0,0 +1,494 @@
"""
Benchmark: Spell-checking & language detection approaches for OCR post-correction.
Tests pyspellchecker (already used), symspellpy (candidate), and
dual-dictionary language detection heuristic on real vocabulary OCR data.
Run: pytest tests/test_spell_benchmark.py -v -s
"""
import time
import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _load_pyspellchecker():
    """Return (EN, DE) pyspellchecker instances with edit distance 1."""
    from spellchecker import SpellChecker

    english = SpellChecker(language='en', distance=1)
    german = SpellChecker(language='de', distance=1)
    return english, german
def _load_symspellpy():
    """Load symspellpy with English frequency dict (bundled).

    Returns (SymSpell instance, Verbosity enum). May raise ImportError or
    FileNotFoundError; callers skip their tests in that case.
    """
    from symspellpy import SymSpell, Verbosity
    sym = SymSpell(max_dictionary_edit_distance=2)
    # Use bundled English frequency dict
    # NOTE(review): pkg_resources is deprecated in modern setuptools;
    # consider importlib.resources.files("symspellpy") — verify first.
    import pkg_resources
    dict_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
    sym.load_dictionary(dict_path, term_index=0, count_index=1)
    return sym, Verbosity
# ---------------------------------------------------------------------------
# Test data: (ocr_output, expected_correction, language, category)
# ---------------------------------------------------------------------------
OCR_TEST_CASES = [
    # Each row: (ocr_output, expected_correction, language, category).
    # Rows whose expected value equals the input are "must not change" cases.
    # --- Single-char ambiguity ---
    ("l am a student", "I am a student", "en", "a_vs_I"),
    ("a book", "a book", "en", "a_vs_I"),  # should NOT change
    ("I like cats", "I like cats", "en", "a_vs_I"),  # should NOT change
    ("lt is raining", "It is raining", "en", "a_vs_I"),  # l→I at start
    # --- Digit-letter confusion ---
    ("g0od", "good", "en", "digit_letter"),
    ("sch00l", "school", "en", "digit_letter"),
    ("he1lo", "hello", "en", "digit_letter"),
    ("Sch0n", "Schon", "de", "digit_letter"),  # German
    # --- Umlaut drops ---
    ("schon", "schön", "de", "umlaut"),  # context: "schon" is also valid DE!
    ("Schuler", "Schüler", "de", "umlaut"),
    ("uber", "über", "de", "umlaut"),
    ("Bucher", "Bücher", "de", "umlaut"),
    ("Turkei", "Türkei", "de", "umlaut"),
    # --- Common OCR errors ---
    ("beautful", "beautiful", "en", "missing_char"),
    ("teh", "the", "en", "transposition"),
    ("becasue", "because", "en", "transposition"),
    ("freind", "friend", "en", "swap"),
    ("Freund", "Freund", "de", "correct"),  # already correct
    # --- Merged words ---
    ("atmyschool", "at my school", "en", "merged"),
    ("goodidea", "good idea", "en", "merged"),
    # --- Mixed language example sentences ---
    ("I go to teh school", "I go to the school", "en", "sentence"),
    ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"),
]
# Language detection test: (word, expected_language)
LANG_DETECT_CASES = [
    # Clear English
    ("school", "en"),
    ("beautiful", "en"),
    ("homework", "en"),
    ("yesterday", "en"),
    ("children", "en"),
    ("because", "en"),
    ("environment", "en"),
    ("although", "en"),
    # Clear German
    ("Schule", "de"),
    ("Hausaufgaben", "de"),
    ("Freundschaft", "de"),
    ("Umwelt", "de"),
    ("Kindergarten", "de"),  # also used in English!
    ("Bücher", "de"),
    ("Straße", "de"),
    ("Entschuldigung", "de"),
    # Ambiguous (exist in both)
    ("Hand", "both"),
    ("Finger", "both"),
    ("Arm", "both"),
    ("Name", "both"),
    ("Ball", "both"),
    # Short/tricky
    ("a", "en"),
    ("I", "en"),
    ("in", "both"),
    ("an", "both"),
    ("the", "en"),
    ("die", "de"),
    ("der", "de"),
    ("to", "en"),
    ("zu", "de"),
]
# ===========================================================================
# Tests
# ===========================================================================
class TestPyspellchecker:
    """Test pyspellchecker capabilities for OCR correction.

    Mostly reporting-style benchmarks: they print OK/MISS per case rather
    than hard-asserting, so a regression shows up in the -s output.
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        # autouse: every test in this class gets fresh EN/DE checkers.
        self.en, self.de = _load_pyspellchecker()
    def test_known_words(self):
        """Verify basic dictionary lookup."""
        assert self.en.known(["school"])
        assert self.en.known(["beautiful"])
        # pyspellchecker dictionaries are lowercase-keyed.
        assert self.de.known(["schule"])  # lowercase
        assert self.de.known(["freund"])
        # Not known
        assert not self.en.known(["xyzqwk"])
        assert not self.de.known(["xyzqwk"])
    def test_correction_quality(self):
        """Test correction suggestions for OCR errors.

        Reporting-only: tallies hits over the single-word OCR_TEST_CASES.
        """
        results = []
        for ocr, expected, lang, category in OCR_TEST_CASES:
            if category in ("sentence", "sentence_correct", "merged", "a_vs_I"):
                continue  # skip multi-word cases
            spell = self.en if lang == "en" else self.de
            words = ocr.split()
            corrected = []
            for w in words:
                if spell.known([w.lower()]):
                    corrected.append(w)
                else:
                    fix = spell.correction(w.lower())
                    if fix and fix != w.lower():
                        # Preserve case
                        if w[0].isupper():
                            fix = fix[0].upper() + fix[1:]
                        corrected.append(fix)
                    else:
                        corrected.append(w)
            result = " ".join(corrected)
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            # NOTE(review): a separator (likely "→") appears to have been
            # lost between the quoted values in these prints — confirm
            # against the original source.
            if not ok:
                print(f" MISS: '{ocr}''{result}' (expected '{expected}') [{category}]")
            else:
                print(f" OK: '{ocr}''{result}' [{category}]")
        # Rows are 5-tuples; the pattern `*_, ok, _` binds the 4th field (ok).
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)")
    def test_language_detection_heuristic(self):
        """Test dual-dictionary language detection."""
        results = []
        for word, expected_lang in LANG_DETECT_CASES:
            w = word.lower()
            # Same dual-dict heuristic as smart_spell.detect_word_lang.
            in_en = bool(self.en.known([w]))
            in_de = bool(self.de.known([w]))
            if in_en and in_de:
                detected = "both"
            elif in_en:
                detected = "en"
            elif in_de:
                detected = "de"
            else:
                detected = "unknown"
            ok = detected == expected_lang
            results.append((word, expected_lang, detected, ok))
            if not ok:
                print(f" MISS: '{word}'{detected} (expected {expected_lang})")
            else:
                print(f" OK: '{word}'{detected}")
        correct = sum(1 for *_, ok in results if ok)
        total = len(results)
        print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)")
    def test_umlaut_awareness(self):
        """Test if pyspellchecker suggests umlaut corrections.

        Reporting-only: inspects candidate sets, no assertions.
        """
        # "Schuler" should suggest "Schüler"
        candidates = self.de.candidates("schuler")
        print(f" 'schuler' candidates: {candidates}")
        # "uber" should suggest "über"
        candidates_uber = self.de.candidates("uber")
        print(f" 'uber' candidates: {candidates_uber}")
        # "Turkei" should suggest "Türkei"
        candidates_turkei = self.de.candidates("turkei")
        print(f" 'turkei' candidates: {candidates_turkei}")
    def test_speed_100_words(self):
        """Measure correction speed for 100 words."""
        words_en = ["beautful", "teh", "becasue", "freind", "shcool",
                    "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words_en:
            self.en.correction(w)
        dt = time.time() - t0
        print(f"\n pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms")
        words_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung",
                    "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10
        t0 = time.time()
        for w in words_de:
            self.de.correction(w)
        dt = time.time() - t0
        print(f" pyspellchecker: 100 DE corrections in {dt*1000:.0f}ms")
class TestSymspellpy:
    """Test symspellpy as a faster alternative.

    All tests skip when symspellpy (or its bundled dictionary) is missing.
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        try:
            self.sym, self.Verbosity = _load_symspellpy()
            self.available = True
        except (ImportError, FileNotFoundError) as e:
            self.available = False
            pytest.skip(f"symspellpy not installed: {e}")
    def test_correction_quality(self):
        """Test symspellpy corrections (EN only — no DE dict bundled)."""
        # NOTE(review): `_` here is the bound language field of each 4-tuple
        # and is used in the filter condition; it works, but renaming it to
        # `lang` would be clearer.
        en_cases = [(o, e, c) for o, e, _, c in OCR_TEST_CASES
                    if _ == "en" and c not in ("sentence", "sentence_correct", "merged", "a_vs_I")]
        results = []
        for ocr, expected, category in en_cases:
            suggestions = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2)
            if suggestions:
                fix = suggestions[0].term
                # Preserve the original initial capitalization.
                if ocr[0].isupper():
                    fix = fix[0].upper() + fix[1:]
                result = fix
            else:
                result = ocr
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            status = "OK" if ok else "MISS"
            # NOTE(review): a separator (likely "→") appears lost between
            # the quoted values — confirm against the original source.
            print(f" {status}: '{ocr}''{result}' (expected '{expected}') [{category}]")
        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)")
    def test_speed_100_words(self):
        """Measure symspellpy correction speed for 100 words."""
        words = ["beautful", "teh", "becasue", "freind", "shcool",
                 "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words:
            self.sym.lookup(w, self.Verbosity.CLOSEST, max_edit_distance=2)
        dt = time.time() - t0
        print(f"\n symspellpy: 100 EN corrections in {dt*1000:.0f}ms")
    def test_compound_segmentation(self):
        """Test symspellpy's word segmentation for merged words.

        Reporting-only: prints OK/MISS, no assertions.
        """
        cases = [
            ("atmyschool", "at my school"),
            ("goodidea", "good idea"),
            ("makeadecision", "make a decision"),
        ]
        for merged, expected in cases:
            result = self.sym.word_segmentation(merged)
            ok = result.corrected_string == expected
            status = "OK" if ok else "MISS"
            print(f" {status}: '{merged}''{result.corrected_string}' (expected '{expected}')")
class TestContextDisambiguation:
    """Test context-based disambiguation for a/I and similar cases.

    Prototype for the bigram heuristic later productionized in
    smart_spell (reporting-only: prints OK/MISS, no assertions).
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()
    def test_bigram_context(self):
        """Use simple bigram heuristic for a/I disambiguation.
        Approach: check if 'a <next_word>' or 'I <next_word>' is more
        common by checking if <next_word> is a noun (follows 'a') or
        verb (follows 'I').
        """
        # Common words that follow "I" (verbs)
        i_followers = {"am", "was", "have", "had", "do", "did", "will",
                       "would", "can", "could", "should", "shall", "may",
                       "might", "think", "know", "see", "want", "need",
                       "like", "love", "hate", "go", "went", "come",
                       "came", "say", "said", "get", "got", "make", "made",
                       "take", "took", "give", "gave", "tell", "told",
                       "feel", "felt", "find", "found", "believe", "hope",
                       "remember", "forget", "understand", "mean", "meant",
                       "don't", "didn't", "can't", "won't", "couldn't",
                       "shouldn't", "wouldn't", "haven't", "hadn't"}
        # Common words that follow "a" (nouns/adjectives)
        a_followers = {"lot", "few", "little", "bit", "good", "bad",
                       "big", "small", "great", "new", "old", "long",
                       "short", "man", "woman", "boy", "girl", "dog",
                       "cat", "book", "car", "house", "day", "year",
                       "nice", "beautiful", "large", "huge", "tiny"}
        def disambiguate_a_I(token: str, next_word: str) -> str:
            """Given an ambiguous 'a' or 'I' (or 'l'), pick the right one."""
            nw = next_word.lower()
            if nw in i_followers:
                return "I"
            if nw in a_followers:
                return "a"
            # Fallback: if next word is known verb → I, known adj/noun → a
            # For now, use a simple heuristic: lowercase → "a", uppercase first letter → "I"
            return token  # no change if uncertain
        # (token, next_word, expected_resolution)
        cases = [
            ("l", "am", "I"),
            ("l", "was", "I"),
            ("l", "think", "I"),
            ("a", "book", "a"),
            ("a", "cat", "a"),
            ("a", "lot", "a"),
            ("l", "big", "a"),  # "a big ..."
            ("a", "have", "I"),  # "I have ..."
        ]
        results = []
        for token, next_word, expected in cases:
            result = disambiguate_a_I(token, next_word)
            ok = result == expected
            results.append((token, next_word, expected, result, ok))
            status = "OK" if ok else "MISS"
            # NOTE(review): a separator (likely "→") appears lost between
            # the quoted values — confirm against the original source.
            print(f" {status}: '{token} {next_word}...''{result}' (expected '{expected}')")
        correct = sum(1 for *_, ok in results if ok)
        total = len(results)
        print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)")
class TestLangDetectLibrary:
    """Test py3langid or langdetect if available.

    NOTE(review): despite the docstring/method names, both tests import the
    `langid` package — confirm whether py3langid was intended.
    Reporting-only: prints OK/MISS, no assertions.
    """
    def test_py3langid(self):
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")
        # (sentence, expected language)
        sentences = [
            ("I go to school every day", "en"),
            ("Ich gehe jeden Tag zur Schule", "de"),
            ("The weather is nice today", "en"),
            ("Das Wetter ist heute schön", "de"),
            ("She likes to play football", "en"),
            ("Er spielt gerne Fußball", "de"),
        ]
        results = []
        for text, expected in sentences:
            lang, confidence = langid.classify(text)
            ok = lang == expected
            results.append(ok)
            status = "OK" if ok else "MISS"
            print(f" {status}: '{text[:40]}...'{lang} ({confidence:.2f}) (expected {expected})")
        correct = sum(results)
        print(f"\nlangid sentence detection: {correct}/{len(results)} correct")
    def test_langid_single_words(self):
        """langid on single words — expected to be unreliable."""
        try:
            import langid
        except ImportError:
            pytest.skip("langid not installed")
        words = [("school", "en"), ("Schule", "de"), ("book", "en"),
                 ("Buch", "de"), ("car", "en"), ("Auto", "de"),
                 ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")]
        results = []
        for word, expected in words:
            lang, conf = langid.classify(word)
            ok = lang == expected
            results.append(ok)
            status = "OK" if ok else "MISS"
            print(f" {status}: '{word}'{lang} ({conf:.2f}) (expected {expected})")
        correct = sum(results)
        print(f"\nlangid single-word: {correct}/{len(results)} correct")
class TestIntegratedApproach:
    """Test the combined approach: dict-heuristic for lang + spell correction.

    Prototype of the pipeline productionized in smart_spell.
    Reporting-only: prints OK/MISS per entry, no assertions.
    """
    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()
    def detect_language(self, word: str) -> str:
        """Dual-dict heuristic language detection."""
        w = word.lower()
        # Skip very short words — too ambiguous
        if len(w) <= 2:
            return "ambiguous"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"
    def correct_word(self, word: str, expected_lang: str) -> str:
        """Correct a single word given the expected language.

        Returns the input unchanged when it is valid in either language
        (cross-language guard) or when no correction is found.
        """
        w_lower = word.lower()
        spell = self.en if expected_lang == "en" else self.de
        # Already known
        if spell.known([w_lower]):
            return word
        # Also check the other language — might be fine
        other = self.de if expected_lang == "en" else self.en
        if other.known([w_lower]):
            return word  # valid in the other language
        # Try correction
        fix = spell.correction(w_lower)
        if fix and fix != w_lower:
            # Preserve the original initial capitalization.
            if word[0].isupper():
                fix = fix[0].upper() + fix[1:]
            return fix
        return word
    def test_full_pipeline(self):
        """Test: detect language → correct with appropriate dict."""
        vocab_entries = [
            # (english_col, german_col, expected_en, expected_de)
            ("beautful", "schön", "beautiful", "schön"),
            ("school", "Schule", "school", "Schule"),
            ("teh cat", "die Katze", "the cat", "die Katze"),
            ("freind", "Freund", "friend", "Freund"),
            ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"),
            ("Schuler", "Schuler", "Schuler", "Schüler"),  # DE umlaut: Schüler
        ]
        en_correct = 0
        de_correct = 0
        total = len(vocab_entries)
        for en_ocr, de_ocr, exp_en, exp_de in vocab_entries:
            # Correct each word in the column
            en_words = en_ocr.split()
            de_words = de_ocr.split()
            en_fixed = " ".join(self.correct_word(w, "en") for w in en_words)
            de_fixed = " ".join(self.correct_word(w, "de") for w in de_words)
            en_ok = en_fixed == exp_en
            de_ok = de_fixed == exp_de
            # bool adds as 0/1 — running tally of correct columns.
            en_correct += en_ok
            de_correct += de_ok
            en_status = "OK" if en_ok else "MISS"
            de_status = "OK" if de_ok else "MISS"
            # NOTE(review): a separator (likely "→") appears lost between
            # the quoted values — confirm against the original source.
            print(f" EN {en_status}: '{en_ocr}''{en_fixed}' (expected '{exp_en}')")
            print(f" DE {de_status}: '{de_ocr}''{de_fixed}' (expected '{exp_de}')")
        print(f"\nEN corrections: {en_correct}/{total} correct")
        print(f"DE corrections: {de_correct}/{total} correct")