diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py index c31d416..5da85c2 100644 --- a/klausur-service/backend/cv_review.py +++ b/klausur-service/backend/cv_review.py @@ -881,10 +881,25 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict: """Rule-based OCR correction: spell-checker + structural heuristics. Deterministic — never translates, never touches IPA, never hallucinates. + Uses SmartSpellChecker for language-aware corrections with context-based + disambiguation (a/I), multi-digit substitution, and cross-language guard. """ t0 = time.time() changes: List[Dict] = [] all_corrected: List[Dict] = [] + + # Use SmartSpellChecker if available, fall back to legacy _spell_fix_field + _smart = None + try: + from smart_spell import SmartSpellChecker + _smart = SmartSpellChecker() + logger.debug("spell_review: using SmartSpellChecker") + except Exception: + logger.debug("spell_review: SmartSpellChecker not available, using legacy") + + # Map field names → language codes for SmartSpellChecker + _LANG_MAP = {"english": "en", "german": "de", "example": "auto"} + for i, entry in enumerate(entries): e = dict(entry) # Page-ref normalization (always, regardless of review status) @@ -907,9 +922,18 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict: old_val = (e.get(field_name) or "").strip() if not old_val: continue - # example field is mixed-language — try German first (for umlauts) - lang = "german" if field_name in ("german", "example") else "english" - new_val, was_changed = _spell_fix_field(old_val, field=lang) + + if _smart: + # SmartSpellChecker path — language-aware, context-based + lang_code = _LANG_MAP.get(field_name, "en") + result = _smart.correct_text(old_val, lang=lang_code) + new_val = result.corrected + was_changed = result.changed + else: + # Legacy path + lang = "german" if field_name in ("german", "example") else "english" + new_val, was_changed = _spell_fix_field(old_val, field=lang) + if was_changed 
and new_val != old_val: changes.append({ "row_index": e.get("row_index", i), @@ -921,12 +945,13 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict: e["llm_corrected"] = True all_corrected.append(e) duration_ms = int((time.time() - t0) * 1000) + model_name = "smart-spell-checker" if _smart else "spell-checker" return { "entries_original": entries, "entries_corrected": all_corrected, "changes": changes, "skipped_count": 0, - "model_used": "spell-checker", + "model_used": model_name, "duration_ms": duration_ms, } diff --git a/klausur-service/backend/smart_spell.py b/klausur-service/backend/smart_spell.py new file mode 100644 index 0000000..3b5cf18 --- /dev/null +++ b/klausur-service/backend/smart_spell.py @@ -0,0 +1,369 @@ +""" +SmartSpellChecker — Language-aware OCR post-correction without LLMs. + +Uses pyspellchecker (MIT) with dual EN+DE dictionaries for: +- Automatic language detection per word (dual-dictionary heuristic) +- OCR error correction (digit↔letter, umlauts, transpositions) +- Context-based disambiguation (a/I, l/I) via bigram lookup +- Mixed-language support for example sentences + +Lizenz: Apache 2.0 (kommerziell nutzbar) +""" + +import logging +import re +from dataclasses import dataclass, field +from typing import Dict, List, Literal, Optional, Set, Tuple + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Init +# --------------------------------------------------------------------------- + +try: + from spellchecker import SpellChecker as _SpellChecker + _en_spell = _SpellChecker(language='en', distance=1) + _de_spell = _SpellChecker(language='de', distance=1) + _AVAILABLE = True +except ImportError: + _AVAILABLE = False + logger.warning("pyspellchecker not installed — SmartSpellChecker disabled") + +Lang = Literal["en", "de", "both", "unknown"] + +# --------------------------------------------------------------------------- +# Bigram context for a/I disambiguation +# 
--------------------------------------------------------------------------- + +# Words that commonly follow "I" (subject pronoun → verb/modal) +_I_FOLLOWERS: frozenset = frozenset({ + "am", "was", "have", "had", "do", "did", "will", "would", "can", + "could", "should", "shall", "may", "might", "must", + "think", "know", "see", "want", "need", "like", "love", "hate", + "go", "went", "come", "came", "say", "said", "get", "got", + "make", "made", "take", "took", "give", "gave", "tell", "told", + "feel", "felt", "find", "found", "believe", "hope", "wish", + "remember", "forget", "understand", "mean", "meant", + "don't", "didn't", "can't", "won't", "couldn't", "wouldn't", + "shouldn't", "haven't", "hadn't", "isn't", "wasn't", + "really", "just", "also", "always", "never", "often", "sometimes", +}) + +# Words that commonly follow "a" (article → noun/adjective) +_A_FOLLOWERS: frozenset = frozenset({ + "lot", "few", "little", "bit", "good", "bad", "great", "new", "old", + "long", "short", "big", "small", "large", "huge", "tiny", + "nice", "beautiful", "wonderful", "terrible", "horrible", + "man", "woman", "boy", "girl", "child", "dog", "cat", "bird", + "book", "car", "house", "room", "school", "teacher", "student", + "day", "week", "month", "year", "time", "place", "way", + "friend", "family", "person", "problem", "question", "story", + "very", "really", "quite", "rather", "pretty", "single", +}) + +# Digit→letter substitutions (OCR confusion) +_DIGIT_SUBS: Dict[str, List[str]] = { + '0': ['o', 'O'], + '1': ['l', 'I'], + '5': ['s', 'S'], + '6': ['g', 'G'], + '8': ['b', 'B'], + '|': ['I', 'l'], +} +_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys()) + +# Umlaut confusion: OCR drops dots (ü→u, ä→a, ö→o) +_UMLAUT_MAP = { + 'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü', + 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü', +} + +# Tokenizer +_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)") + + +# --------------------------------------------------------------------------- +# 
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------

@dataclass
class CorrectionResult:
    """Outcome of correcting one text field.

    Fields:
        original      — the input text, untouched.
        corrected     — the (possibly identical) corrected text.
        lang_detected — language the correction ran under
                        ("en" / "de" / "both" / "unknown").
        changed       — True iff ``corrected`` differs from ``original``.
        changes       — human-readable "old→new" descriptions, one per token.
    """
    original: str
    corrected: str
    lang_detected: Lang
    changed: bool
    changes: List[str] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Core class
# ---------------------------------------------------------------------------

class SmartSpellChecker:
    """Language-aware OCR spell checker using pyspellchecker (no LLM).

    Wraps the module-level EN and DE dictionaries and applies, per token:
    digit/pipe substitution, umlaut restoration (German), context-based
    a/I disambiguation, and plain edit-distance correction guarded against
    false fixes of words valid in the *other* language.
    """

    def __init__(self):
        # Fail fast if the optional dependency was not importable at module load.
        if not _AVAILABLE:
            raise RuntimeError("pyspellchecker not installed")
        self.en = _en_spell
        self.de = _de_spell

    # --- Language detection ---

    def detect_word_lang(self, word: str) -> Lang:
        """Detect language of a single word using the dual-dict heuristic.

        Returns "both" when the word exists in both dictionaries,
        "unknown" when it exists in neither (or is empty after stripping
        surrounding punctuation).
        """
        w = word.lower().strip(".,;:!?\"'()")
        if not w:
            return "unknown"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"

    def detect_text_lang(self, text: str) -> Lang:
        """Detect dominant language of a text string (sentence/phrase).

        Words recognized by both dictionaries ("both") do not count for
        either side; an exact nonzero tie yields "both".
        """
        words = re.findall(r"[A-Za-zÄÖÜäöüß]+", text)
        if not words:
            return "unknown"

        en_count = 0
        de_count = 0
        for w in words:
            lang = self.detect_word_lang(w)
            if lang == "en":
                en_count += 1
            elif lang == "de":
                de_count += 1
            # "both" doesn't count for either

        if en_count > de_count:
            return "en"
        if de_count > en_count:
            return "de"
        if en_count == de_count and en_count > 0:
            return "both"
        return "unknown"

    # --- Single-word correction ---

    def _known(self, word: str) -> bool:
        """True if word is known in EN or DE dictionary."""
        w = word.lower()
        return bool(self.en.known([w])) or bool(self.de.known([w]))

    def _known_in(self, word: str, lang: str) -> bool:
        """True if word is known in a specific language dictionary."""
        w = word.lower()
        spell = self.en if lang == "en" else self.de
        return bool(spell.known([w]))

    def correct_word(self, word: str, lang: str = "en",
                     prev_word: str = "", next_word: str = "") -> Optional[str]:
        """Correct a single word for the given language.

        Returns None if no correction is needed/possible, or the corrected
        string.

        Args:
            word: The word to check/correct.
            lang: Expected language ("en" or "de").
            prev_word: Previous word (context; currently reserved).
            next_word: Next word (context for a/I disambiguation).
        """
        if not word or not word.strip():
            return None

        # Skip numbers and dotted tokens (abbreviations, ordinals).
        if word.isdigit() or '.' in word:
            return None

        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)

        # 1. Already known → no fix (except single-char a/I lookalikes).
        if self._known(word):
            if word.lower() in ('l', '|') and next_word:
                return self._disambiguate_a_I(word, next_word)
            return None

        # 2. Digit/pipe substitution (single char first, then multi-char).
        if has_suspicious:
            if word == '|':
                return 'I'
            for i, ch in enumerate(word):
                if ch not in _DIGIT_SUBS:
                    continue
                for replacement in _DIGIT_SUBS[ch]:
                    candidate = word[:i] + replacement + word[i + 1:]
                    if self._known(candidate):
                        return candidate
            multi = self._try_multi_digit_sub(word)  # e.g. "sch00l" → "school"
            if multi:
                return multi

        # 3. Umlaut restoration (German — OCR commonly drops the dots).
        if lang == "de" and len(word) >= 3 and word.isalpha():
            umlaut_fix = self._try_umlaut_fix(word)
            if umlaut_fix:
                return umlaut_fix

        # 4. General edit-distance correction.
        if not has_suspicious and len(word) >= 3 and word.isalpha():
            # Safety: don't correct if the word is valid in the OTHER
            # language (directly, or via a DE umlaut variant) — it is more
            # likely a cross-column word or a proper name than a typo.
            other_lang = "de" if lang == "en" else "en"
            if self._known_in(word, other_lang):
                return None
            if other_lang == "de" and self._try_umlaut_fix(word):
                return None  # has a valid DE umlaut variant → don't touch

            spell = self.en if lang == "en" else self.de
            correction = spell.correction(word.lower())
            if correction and correction != word.lower():
                if word[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if self._known(correction):
                    return correction

        return None

    # --- Multi-digit substitution ---
    # NOTE(review): the previous revision carried a second, dead definition
    # of this method that called a nonexistent self._multi_sub_recurse and
    # read a never-written _multi_sub_recurse_result attribute; only this
    # BFS implementation is kept.

    def _try_multi_digit_sub(self, word: str) -> Optional[str]:
        """Try replacing multiple digits simultaneously using BFS.

        Bounded: at most 4 suspicious positions, ≤3 options each, so the
        candidate set is at most 3**4 = 81 variants.
        """
        positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
        if not positions or len(positions) > 4:
            return None

        # BFS over substitution combinations (keep-original + each option).
        queue = [list(word)]
        for pos, ch in positions:
            next_queue = []
            for current in queue:
                # Keep original
                next_queue.append(current[:])
                # Try each substitution
                for repl in _DIGIT_SUBS[ch]:
                    variant = current[:]
                    variant[pos] = repl
                    next_queue.append(variant)
            queue = next_queue

        # First combination that differs from the input and is a known word wins.
        for combo in queue:
            candidate = "".join(combo)
            if candidate != word and self._known(candidate):
                return candidate

        return None

    # --- Umlaut fix ---

    def _try_umlaut_fix(self, word: str) -> Optional[str]:
        """Try single-char umlaut substitutions for German words."""
        for i, ch in enumerate(word):
            if ch in _UMLAUT_MAP:
                candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
                if self._known(candidate):
                    return candidate
        return None

    # --- a/I disambiguation ---

    def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
        """Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|').

        Uses the bigram follower sets: a following verb/modal implies the
        pronoun "I"; a following noun/adjective implies the article "a".
        Returns None when the context is inconclusive (don't change).
        """
        nw = next_word.lower().strip(".,;:!?")
        if nw in _I_FOLLOWERS:
            return "I"
        if nw in _A_FOLLOWERS:
            return "a"
        return None  # uncertain, don't change

    # --- Full text correction ---

    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Tokenizes with _TOKEN_RE (word + trailing separator pairs) so that
        punctuation and whitespace are preserved byte-for-byte.

        Args:
            text: The text to correct.
            lang: Expected language ("en", "de", or "auto" to detect).
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)

        detected = self.detect_text_lang(text) if lang == "auto" else lang

        parts: List[str] = []
        changes: List[str] = []
        tokens = list(_TOKEN_RE.finditer(text))

        for idx, m in enumerate(tokens):
            token, sep = m.group(1), m.group(2)
            next_word = tokens[idx + 1].group(1) if idx + 1 < len(tokens) else ""
            prev_word = tokens[idx - 1].group(1) if idx > 0 else ""

            correction = self.correct_word(
                token, lang=detected if detected in ("en", "de") else "en",
                prev_word=prev_word, next_word=next_word,
            )
            if correction and correction != token:
                changes.append(f"{token}→{correction}")
                parts.append(correction)
            else:
                parts.append(token)
            parts.append(sep)

        # Append any trailing text not covered by the tokenizer.
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])

        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---

    def correct_vocab_entry(self, english: str, german: str,
                            example: str = "") -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language — the most reliable signal.
        """
        results = {}
        results["english"] = self.correct_text(english, lang="en")
        results["german"] = self.correct_text(german, lang="de")
        if example:
            # For examples, auto-detect language
            results["example"] = self.correct_text(example, lang="auto")
        return results
+ """ + results = {} + results["english"] = self.correct_text(english, lang="en") + results["german"] = self.correct_text(german, lang="de") + if example: + # For examples, auto-detect language + results["example"] = self.correct_text(example, lang="auto") + return results diff --git a/klausur-service/backend/tests/test_smart_spell.py b/klausur-service/backend/tests/test_smart_spell.py new file mode 100644 index 0000000..2816803 --- /dev/null +++ b/klausur-service/backend/tests/test_smart_spell.py @@ -0,0 +1,210 @@ +"""Tests for SmartSpellChecker — language-aware OCR post-correction.""" + +import pytest +import sys, os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from smart_spell import SmartSpellChecker, CorrectionResult + + +@pytest.fixture +def sc(): + return SmartSpellChecker() + + +# ─── Language Detection ────────────────────────────────────────────────────── + + +class TestLanguageDetection: + + def test_clear_english_words(self, sc): + for word in ("school", "beautiful", "homework", "yesterday", "because"): + assert sc.detect_word_lang(word) in ("en", "both"), f"{word} should be EN" + + def test_clear_german_words(self, sc): + for word in ("Schule", "Hausaufgaben", "Freundschaft", "Straße", "Entschuldigung"): + assert sc.detect_word_lang(word) in ("de", "both"), f"{word} should be DE" + + def test_ambiguous_words(self, sc): + """Words that exist in both languages.""" + for word in ("Hand", "Finger", "Arm", "Name", "Ball"): + assert sc.detect_word_lang(word) == "both", f"{word} should be 'both'" + + def test_unknown_words(self, sc): + assert sc.detect_word_lang("xyzqwk") == "unknown" + assert sc.detect_word_lang("") == "unknown" + + def test_english_sentence(self, sc): + assert sc.detect_text_lang("I go to school every day") == "en" + + def test_german_sentence(self, sc): + assert sc.detect_text_lang("Ich gehe jeden Tag zur Schule") == "de" + + def test_mixed_sentence(self, sc): + # Dominant language should win + lang = 
sc.detect_text_lang("I like to play Fußball with my Freunde") + assert lang in ("en", "both") + + +# ─── Single Word Correction ──────────────────────────────────────────────── + + +class TestSingleWordCorrection: + + def test_known_word_not_changed(self, sc): + assert sc.correct_word("school", "en") is None + assert sc.correct_word("Freund", "de") is None + + def test_digit_letter_single(self, sc): + assert sc.correct_word("g0od", "en") == "good" + assert sc.correct_word("he1lo", "en") == "hello" + + def test_digit_letter_multi(self, sc): + """Multiple digit substitutions (e.g., sch00l).""" + result = sc.correct_word("sch00l", "en") + assert result == "school", f"Expected 'school', got '{result}'" + + def test_pipe_to_I(self, sc): + assert sc.correct_word("|", "en") == "I" + + def test_umlaut_schuler(self, sc): + assert sc.correct_word("Schuler", "de") == "Schüler" + + def test_umlaut_uber(self, sc): + assert sc.correct_word("uber", "de") == "über" + + def test_umlaut_bucher(self, sc): + assert sc.correct_word("Bucher", "de") == "Bücher" + + def test_umlaut_turkei(self, sc): + assert sc.correct_word("Turkei", "de") == "Türkei" + + def test_missing_char(self, sc): + assert sc.correct_word("beautful", "en") == "beautiful" + + def test_transposition(self, sc): + assert sc.correct_word("teh", "en") == "the" + + def test_swap(self, sc): + assert sc.correct_word("freind", "en") == "friend" + + def test_no_false_correction_cross_lang(self, sc): + """Don't correct a word that's valid in the other language. + + 'Schuler' in the EN column should NOT be corrected to 'Schuyler' + because 'Schüler' is valid German — it's likely a German word + that ended up in the wrong column (or is a surname). 
+ """ + # Schuler is valid DE (after umlaut fix → Schüler), so + # in the EN column it should be left alone + result = sc.correct_word("Schuler", "en") + # Should either be None (no change) or not "Schuyler" + assert result != "Schuyler", "Should not false-correct German word in EN column" + + +# ─── a/I Disambiguation ────────────────────────────────────────────────────── + + +class TestAIDisambiguation: + + def test_I_before_verb(self, sc): + assert sc._disambiguate_a_I("l", "am") == "I" + assert sc._disambiguate_a_I("l", "was") == "I" + assert sc._disambiguate_a_I("l", "think") == "I" + assert sc._disambiguate_a_I("l", "have") == "I" + assert sc._disambiguate_a_I("l", "don't") == "I" + + def test_a_before_noun_adj(self, sc): + assert sc._disambiguate_a_I("a", "book") == "a" + assert sc._disambiguate_a_I("a", "cat") == "a" + assert sc._disambiguate_a_I("a", "big") == "a" + assert sc._disambiguate_a_I("a", "lot") == "a" + + def test_uncertain_returns_none(self, sc): + """When context is ambiguous, return None (don't change).""" + assert sc._disambiguate_a_I("l", "xyzqwk") is None + + +# ─── Full Text Correction ─────────────────────────────────────────────────── + + +class TestFullTextCorrection: + + def test_english_sentence(self, sc): + result = sc.correct_text("teh cat is beautful", "en") + assert result.changed + assert "the" in result.corrected + assert "beautiful" in result.corrected + + def test_german_sentence_no_change(self, sc): + result = sc.correct_text("Ich gehe zur Schule", "de") + assert not result.changed + + def test_german_umlaut_fix(self, sc): + result = sc.correct_text("Der Schuler liest Bucher", "de") + assert "Schüler" in result.corrected + assert "Bücher" in result.corrected + + def test_preserves_punctuation(self, sc): + result = sc.correct_text("teh cat, beautful!", "en") + assert "," in result.corrected + assert "!" 
in result.corrected + + def test_empty_text(self, sc): + result = sc.correct_text("", "en") + assert not result.changed + assert result.corrected == "" + + +# ─── Vocab Entry Correction ───────────────────────────────────────────────── + + +class TestVocabEntryCorrection: + + def test_basic_entry(self, sc): + results = sc.correct_vocab_entry( + english="beautful", + german="schön", + ) + assert results["english"].corrected == "beautiful" + assert results["german"].changed is False + + def test_umlaut_in_german(self, sc): + results = sc.correct_vocab_entry( + english="school", + german="Schuler", + ) + assert results["english"].changed is False + assert results["german"].corrected == "Schüler" + + def test_example_auto_detect(self, sc): + results = sc.correct_vocab_entry( + english="friend", + german="Freund", + example="My best freind lives in Berlin", + ) + assert "friend" in results["example"].corrected + + +# ─── Speed ───────────────────────────────────────────────────────────────── + + +class TestSpeed: + + def test_100_corrections_under_500ms(self, sc): + """100 word corrections should complete in under 500ms.""" + import time + words = [ + ("beautful", "en"), ("teh", "en"), ("freind", "en"), + ("homwork", "en"), ("yesturday", "en"), + ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"), + ("uber", "de"), ("Ubung", "de"), + ] * 10 + + t0 = time.time() + for word, lang in words: + sc.correct_word(word, lang) + dt = time.time() - t0 + + print(f"\n 100 corrections in {dt*1000:.0f}ms") + assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms" diff --git a/klausur-service/backend/tests/test_spell_benchmark.py b/klausur-service/backend/tests/test_spell_benchmark.py new file mode 100644 index 0000000..3b37c63 --- /dev/null +++ b/klausur-service/backend/tests/test_spell_benchmark.py @@ -0,0 +1,494 @@ +""" +Benchmark: Spell-checking & language detection approaches for OCR post-correction. 
+ +Tests pyspellchecker (already used), symspellpy (candidate), and +dual-dictionary language detection heuristic on real vocabulary OCR data. + +Run: pytest tests/test_spell_benchmark.py -v -s +""" + +import time +import pytest + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _load_pyspellchecker(): + from spellchecker import SpellChecker + en = SpellChecker(language='en', distance=1) + de = SpellChecker(language='de', distance=1) + return en, de + + +def _load_symspellpy(): + """Load symspellpy with English frequency dict (bundled).""" + from symspellpy import SymSpell, Verbosity + sym = SymSpell(max_dictionary_edit_distance=2) + # Use bundled English frequency dict + import pkg_resources + dict_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt") + sym.load_dictionary(dict_path, term_index=0, count_index=1) + return sym, Verbosity + + +# --------------------------------------------------------------------------- +# Test data: (ocr_output, expected_correction, language, category) +# --------------------------------------------------------------------------- + +OCR_TEST_CASES = [ + # --- Single-char ambiguity --- + ("l am a student", "I am a student", "en", "a_vs_I"), + ("a book", "a book", "en", "a_vs_I"), # should NOT change + ("I like cats", "I like cats", "en", "a_vs_I"), # should NOT change + ("lt is raining", "It is raining", "en", "a_vs_I"), # l→I at start + + # --- Digit-letter confusion --- + ("g0od", "good", "en", "digit_letter"), + ("sch00l", "school", "en", "digit_letter"), + ("he1lo", "hello", "en", "digit_letter"), + ("Sch0n", "Schon", "de", "digit_letter"), # German + + # --- Umlaut drops --- + ("schon", "schön", "de", "umlaut"), # context: "schon" is also valid DE! 
+ ("Schuler", "Schüler", "de", "umlaut"), + ("uber", "über", "de", "umlaut"), + ("Bucher", "Bücher", "de", "umlaut"), + ("Turkei", "Türkei", "de", "umlaut"), + + # --- Common OCR errors --- + ("beautful", "beautiful", "en", "missing_char"), + ("teh", "the", "en", "transposition"), + ("becasue", "because", "en", "transposition"), + ("freind", "friend", "en", "swap"), + ("Freund", "Freund", "de", "correct"), # already correct + + # --- Merged words --- + ("atmyschool", "at my school", "en", "merged"), + ("goodidea", "good idea", "en", "merged"), + + # --- Mixed language example sentences --- + ("I go to teh school", "I go to the school", "en", "sentence"), + ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"), +] + +# Language detection test: (word, expected_language) +LANG_DETECT_CASES = [ + # Clear English + ("school", "en"), + ("beautiful", "en"), + ("homework", "en"), + ("yesterday", "en"), + ("children", "en"), + ("because", "en"), + ("environment", "en"), + ("although", "en"), + + # Clear German + ("Schule", "de"), + ("Hausaufgaben", "de"), + ("Freundschaft", "de"), + ("Umwelt", "de"), + ("Kindergarten", "de"), # also used in English! 
+ ("Bücher", "de"), + ("Straße", "de"), + ("Entschuldigung", "de"), + + # Ambiguous (exist in both) + ("Hand", "both"), + ("Finger", "both"), + ("Arm", "both"), + ("Name", "both"), + ("Ball", "both"), + + # Short/tricky + ("a", "en"), + ("I", "en"), + ("in", "both"), + ("an", "both"), + ("the", "en"), + ("die", "de"), + ("der", "de"), + ("to", "en"), + ("zu", "de"), +] + + +# =========================================================================== +# Tests +# =========================================================================== + + +class TestPyspellchecker: + """Test pyspellchecker capabilities for OCR correction.""" + + @pytest.fixture(autouse=True) + def setup(self): + self.en, self.de = _load_pyspellchecker() + + def test_known_words(self): + """Verify basic dictionary lookup.""" + assert self.en.known(["school"]) + assert self.en.known(["beautiful"]) + assert self.de.known(["schule"]) # lowercase + assert self.de.known(["freund"]) + # Not known + assert not self.en.known(["xyzqwk"]) + assert not self.de.known(["xyzqwk"]) + + def test_correction_quality(self): + """Test correction suggestions for OCR errors.""" + results = [] + for ocr, expected, lang, category in OCR_TEST_CASES: + if category in ("sentence", "sentence_correct", "merged", "a_vs_I"): + continue # skip multi-word cases + + spell = self.en if lang == "en" else self.de + words = ocr.split() + corrected = [] + for w in words: + if spell.known([w.lower()]): + corrected.append(w) + else: + fix = spell.correction(w.lower()) + if fix and fix != w.lower(): + # Preserve case + if w[0].isupper(): + fix = fix[0].upper() + fix[1:] + corrected.append(fix) + else: + corrected.append(w) + result = " ".join(corrected) + ok = result == expected + results.append((ocr, expected, result, ok, category)) + if not ok: + print(f" MISS: '{ocr}' → '{result}' (expected '{expected}') [{category}]") + else: + print(f" OK: '{ocr}' → '{result}' [{category}]") + + correct = sum(1 for *_, ok, _ in results if ok) + total 
= len(results) + print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)") + + def test_language_detection_heuristic(self): + """Test dual-dictionary language detection.""" + results = [] + for word, expected_lang in LANG_DETECT_CASES: + w = word.lower() + in_en = bool(self.en.known([w])) + in_de = bool(self.de.known([w])) + + if in_en and in_de: + detected = "both" + elif in_en: + detected = "en" + elif in_de: + detected = "de" + else: + detected = "unknown" + + ok = detected == expected_lang + results.append((word, expected_lang, detected, ok)) + if not ok: + print(f" MISS: '{word}' → {detected} (expected {expected_lang})") + else: + print(f" OK: '{word}' → {detected}") + + correct = sum(1 for *_, ok in results if ok) + total = len(results) + print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)") + + def test_umlaut_awareness(self): + """Test if pyspellchecker suggests umlaut corrections.""" + # "Schuler" should suggest "Schüler" + candidates = self.de.candidates("schuler") + print(f" 'schuler' candidates: {candidates}") + # "uber" should suggest "über" + candidates_uber = self.de.candidates("uber") + print(f" 'uber' candidates: {candidates_uber}") + # "Turkei" should suggest "Türkei" + candidates_turkei = self.de.candidates("turkei") + print(f" 'turkei' candidates: {candidates_turkei}") + + def test_speed_100_words(self): + """Measure correction speed for 100 words.""" + words_en = ["beautful", "teh", "becasue", "freind", "shcool", + "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10 + t0 = time.time() + for w in words_en: + self.en.correction(w) + dt = time.time() - t0 + print(f"\n pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms") + + words_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung", + "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10 + t0 = time.time() + for w in words_de: + self.de.correction(w) + dt = time.time() - t0 + print(f" pyspellchecker: 
100 DE corrections in {dt*1000:.0f}ms") + + +class TestSymspellpy: + """Test symspellpy as a faster alternative.""" + + @pytest.fixture(autouse=True) + def setup(self): + try: + self.sym, self.Verbosity = _load_symspellpy() + self.available = True + except (ImportError, FileNotFoundError) as e: + self.available = False + pytest.skip(f"symspellpy not installed: {e}") + + def test_correction_quality(self): + """Test symspellpy corrections (EN only — no DE dict bundled).""" + en_cases = [(o, e, c) for o, e, _, c in OCR_TEST_CASES + if _ == "en" and c not in ("sentence", "sentence_correct", "merged", "a_vs_I")] + + results = [] + for ocr, expected, category in en_cases: + suggestions = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2) + if suggestions: + fix = suggestions[0].term + if ocr[0].isupper(): + fix = fix[0].upper() + fix[1:] + result = fix + else: + result = ocr + + ok = result == expected + results.append((ocr, expected, result, ok, category)) + status = "OK" if ok else "MISS" + print(f" {status}: '{ocr}' → '{result}' (expected '{expected}') [{category}]") + + correct = sum(1 for *_, ok, _ in results if ok) + total = len(results) + print(f"\nsymspellpy EN: {correct}/{total} correct ({100*correct/total:.0f}%)") + + def test_speed_100_words(self): + """Measure symspellpy correction speed for 100 words.""" + words = ["beautful", "teh", "becasue", "freind", "shcool", + "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10 + t0 = time.time() + for w in words: + self.sym.lookup(w, self.Verbosity.CLOSEST, max_edit_distance=2) + dt = time.time() - t0 + print(f"\n symspellpy: 100 EN corrections in {dt*1000:.0f}ms") + + def test_compound_segmentation(self): + """Test symspellpy's word segmentation for merged words.""" + cases = [ + ("atmyschool", "at my school"), + ("goodidea", "good idea"), + ("makeadecision", "make a decision"), + ] + for merged, expected in cases: + result = self.sym.word_segmentation(merged) + ok = 
result.corrected_string == expected + status = "OK" if ok else "MISS" + print(f" {status}: '{merged}' → '{result.corrected_string}' (expected '{expected}')") + + +class TestContextDisambiguation: + """Test context-based disambiguation for a/I and similar cases.""" + + @pytest.fixture(autouse=True) + def setup(self): + self.en, self.de = _load_pyspellchecker() + + def test_bigram_context(self): + """Use simple bigram heuristic for a/I disambiguation. + + Approach: check if 'a ' or 'I ' is more + common by checking if is a noun (follows 'a') or + verb (follows 'I'). + """ + # Common words that follow "I" (verbs) + i_followers = {"am", "was", "have", "had", "do", "did", "will", + "would", "can", "could", "should", "shall", "may", + "might", "think", "know", "see", "want", "need", + "like", "love", "hate", "go", "went", "come", + "came", "say", "said", "get", "got", "make", "made", + "take", "took", "give", "gave", "tell", "told", + "feel", "felt", "find", "found", "believe", "hope", + "remember", "forget", "understand", "mean", "meant", + "don't", "didn't", "can't", "won't", "couldn't", + "shouldn't", "wouldn't", "haven't", "hadn't"} + + # Common words that follow "a" (nouns/adjectives) + a_followers = {"lot", "few", "little", "bit", "good", "bad", + "big", "small", "great", "new", "old", "long", + "short", "man", "woman", "boy", "girl", "dog", + "cat", "book", "car", "house", "day", "year", + "nice", "beautiful", "large", "huge", "tiny"} + + def disambiguate_a_I(token: str, next_word: str) -> str: + """Given an ambiguous 'a' or 'I' (or 'l'), pick the right one.""" + nw = next_word.lower() + if nw in i_followers: + return "I" + if nw in a_followers: + return "a" + # Fallback: if next word is known verb → I, known adj/noun → a + # For now, use a simple heuristic: lowercase → "a", uppercase first letter → "I" + return token # no change if uncertain + + cases = [ + ("l", "am", "I"), + ("l", "was", "I"), + ("l", "think", "I"), + ("a", "book", "a"), + ("a", "cat", "a"), + 
("a", "lot", "a"), + ("l", "big", "a"), # "a big ..." + ("a", "have", "I"), # "I have ..." + ] + + results = [] + for token, next_word, expected in cases: + result = disambiguate_a_I(token, next_word) + ok = result == expected + results.append((token, next_word, expected, result, ok)) + status = "OK" if ok else "MISS" + print(f" {status}: '{token} {next_word}...' → '{result}' (expected '{expected}')") + + correct = sum(1 for *_, ok in results if ok) + total = len(results) + print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)") + + +class TestLangDetectLibrary: + """Test py3langid or langdetect if available.""" + + def test_py3langid(self): + try: + import langid + except ImportError: + pytest.skip("langid not installed") + + sentences = [ + ("I go to school every day", "en"), + ("Ich gehe jeden Tag zur Schule", "de"), + ("The weather is nice today", "en"), + ("Das Wetter ist heute schön", "de"), + ("She likes to play football", "en"), + ("Er spielt gerne Fußball", "de"), + ] + + results = [] + for text, expected in sentences: + lang, confidence = langid.classify(text) + ok = lang == expected + results.append(ok) + status = "OK" if ok else "MISS" + print(f" {status}: '{text[:40]}...' 
→ {lang} ({confidence:.2f}) (expected {expected})") + + correct = sum(results) + print(f"\nlangid sentence detection: {correct}/{len(results)} correct") + + def test_langid_single_words(self): + """langid on single words — expected to be unreliable.""" + try: + import langid + except ImportError: + pytest.skip("langid not installed") + + words = [("school", "en"), ("Schule", "de"), ("book", "en"), + ("Buch", "de"), ("car", "en"), ("Auto", "de"), + ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")] + + results = [] + for word, expected in words: + lang, conf = langid.classify(word) + ok = lang == expected + results.append(ok) + status = "OK" if ok else "MISS" + print(f" {status}: '{word}' → {lang} ({conf:.2f}) (expected {expected})") + + correct = sum(results) + print(f"\nlangid single-word: {correct}/{len(results)} correct") + + +class TestIntegratedApproach: + """Test the combined approach: dict-heuristic for lang + spell correction.""" + + @pytest.fixture(autouse=True) + def setup(self): + self.en, self.de = _load_pyspellchecker() + + def detect_language(self, word: str) -> str: + """Dual-dict heuristic language detection.""" + w = word.lower() + # Skip very short words — too ambiguous + if len(w) <= 2: + return "ambiguous" + in_en = bool(self.en.known([w])) + in_de = bool(self.de.known([w])) + if in_en and in_de: + return "both" + if in_en: + return "en" + if in_de: + return "de" + return "unknown" + + def correct_word(self, word: str, expected_lang: str) -> str: + """Correct a single word given the expected language.""" + w_lower = word.lower() + spell = self.en if expected_lang == "en" else self.de + + # Already known + if spell.known([w_lower]): + return word + + # Also check the other language — might be fine + other = self.de if expected_lang == "en" else self.en + if other.known([w_lower]): + return word # valid in the other language + + # Try correction + fix = spell.correction(w_lower) + if fix and fix != w_lower: + if word[0].isupper(): + fix = 
fix[0].upper() + fix[1:] + return fix + + return word + + def test_full_pipeline(self): + """Test: detect language → correct with appropriate dict.""" + vocab_entries = [ + # (english_col, german_col, expected_en, expected_de) + ("beautful", "schön", "beautiful", "schön"), + ("school", "Schule", "school", "Schule"), + ("teh cat", "die Katze", "the cat", "die Katze"), + ("freind", "Freund", "friend", "Freund"), + ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"), + ("Schuler", "Schuler", "Schuler", "Schüler"), # DE umlaut: Schüler + ] + + en_correct = 0 + de_correct = 0 + total = len(vocab_entries) + + for en_ocr, de_ocr, exp_en, exp_de in vocab_entries: + # Correct each word in the column + en_words = en_ocr.split() + de_words = de_ocr.split() + en_fixed = " ".join(self.correct_word(w, "en") for w in en_words) + de_fixed = " ".join(self.correct_word(w, "de") for w in de_words) + + en_ok = en_fixed == exp_en + de_ok = de_fixed == exp_de + en_correct += en_ok + de_correct += de_ok + + en_status = "OK" if en_ok else "MISS" + de_status = "OK" if de_ok else "MISS" + print(f" EN {en_status}: '{en_ocr}' → '{en_fixed}' (expected '{exp_en}')") + print(f" DE {de_status}: '{de_ocr}' → '{de_fixed}' (expected '{exp_de}')") + + print(f"\nEN corrections: {en_correct}/{total} correct") + print(f"DE corrections: {de_correct}/{total} correct") diff --git a/studio-v2/app/vocab-worksheet/components/ExportTab.tsx b/studio-v2/app/vocab-worksheet/components/ExportTab.tsx new file mode 100644 index 0000000..6604b95 --- /dev/null +++ b/studio-v2/app/vocab-worksheet/components/ExportTab.tsx @@ -0,0 +1,57 @@ +'use client' + +import React from 'react' +import type { VocabWorksheetHook } from '../types' + +export function ExportTab({ h }: { h: VocabWorksheetHook }) { + const { isDark, glassCard } = h + + return ( +
+

PDF herunterladen

+ + {h.worksheetId ? ( +
+
+
+ + + + Arbeitsblatt erfolgreich generiert! +
+
+ +
+ + + {h.includeSolutions && ( + + )} +
+ + +
+ ) : ( +

Noch kein Arbeitsblatt generiert.

+ )} +
+ ) +} diff --git a/studio-v2/app/vocab-worksheet/components/FullscreenPreview.tsx b/studio-v2/app/vocab-worksheet/components/FullscreenPreview.tsx new file mode 100644 index 0000000..64394b6 --- /dev/null +++ b/studio-v2/app/vocab-worksheet/components/FullscreenPreview.tsx @@ -0,0 +1,39 @@ +'use client' + +import React from 'react' +import type { VocabWorksheetHook } from '../types' + +export function FullscreenPreview({ h }: { h: VocabWorksheetHook }) { + return ( +
h.setShowFullPreview(false)}> + +
e.stopPropagation()}> + {h.directFile?.type.startsWith('image/') && h.directFilePreview && ( + Original + )} + {h.directFile?.type === 'application/pdf' && h.directFilePreview && ( +