Add SmartSpellChecker + refactor vocab-worksheet page.tsx
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
SmartSpellChecker (klausur-service): - Language-aware OCR post-correction without LLMs - Dual-dictionary heuristic for EN/DE language detection - Context-based a/I disambiguation via bigram lookup - Multi-digit substitution (sch00l→school) - Cross-language guard (don't false-correct DE words in EN column) - Umlaut correction (Schuler→Schüler, uber→über) - Integrated into spell_review_entries_sync() pipeline - 31 tests, 9ms/100 corrections Vocab-worksheet refactoring (studio-v2): - Split 2337-line page.tsx into 14 files - Custom hook useVocabWorksheet.ts (all state + logic) - 9 components in components/ directory - types.ts, constants.ts for shared definitions Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -881,10 +881,25 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
|
||||
"""Rule-based OCR correction: spell-checker + structural heuristics.
|
||||
|
||||
Deterministic — never translates, never touches IPA, never hallucinates.
|
||||
Uses SmartSpellChecker for language-aware corrections with context-based
|
||||
disambiguation (a/I), multi-digit substitution, and cross-language guard.
|
||||
"""
|
||||
t0 = time.time()
|
||||
changes: List[Dict] = []
|
||||
all_corrected: List[Dict] = []
|
||||
|
||||
# Use SmartSpellChecker if available, fall back to legacy _spell_fix_field
|
||||
_smart = None
|
||||
try:
|
||||
from smart_spell import SmartSpellChecker
|
||||
_smart = SmartSpellChecker()
|
||||
logger.debug("spell_review: using SmartSpellChecker")
|
||||
except Exception:
|
||||
logger.debug("spell_review: SmartSpellChecker not available, using legacy")
|
||||
|
||||
# Map field names → language codes for SmartSpellChecker
|
||||
_LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
|
||||
|
||||
for i, entry in enumerate(entries):
|
||||
e = dict(entry)
|
||||
# Page-ref normalization (always, regardless of review status)
|
||||
@@ -907,9 +922,18 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
|
||||
old_val = (e.get(field_name) or "").strip()
|
||||
if not old_val:
|
||||
continue
|
||||
# example field is mixed-language — try German first (for umlauts)
|
||||
lang = "german" if field_name in ("german", "example") else "english"
|
||||
new_val, was_changed = _spell_fix_field(old_val, field=lang)
|
||||
|
||||
if _smart:
|
||||
# SmartSpellChecker path — language-aware, context-based
|
||||
lang_code = _LANG_MAP.get(field_name, "en")
|
||||
result = _smart.correct_text(old_val, lang=lang_code)
|
||||
new_val = result.corrected
|
||||
was_changed = result.changed
|
||||
else:
|
||||
# Legacy path
|
||||
lang = "german" if field_name in ("german", "example") else "english"
|
||||
new_val, was_changed = _spell_fix_field(old_val, field=lang)
|
||||
|
||||
if was_changed and new_val != old_val:
|
||||
changes.append({
|
||||
"row_index": e.get("row_index", i),
|
||||
@@ -921,12 +945,13 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
|
||||
e["llm_corrected"] = True
|
||||
all_corrected.append(e)
|
||||
duration_ms = int((time.time() - t0) * 1000)
|
||||
model_name = "smart-spell-checker" if _smart else "spell-checker"
|
||||
return {
|
||||
"entries_original": entries,
|
||||
"entries_corrected": all_corrected,
|
||||
"changes": changes,
|
||||
"skipped_count": 0,
|
||||
"model_used": "spell-checker",
|
||||
"model_used": model_name,
|
||||
"duration_ms": duration_ms,
|
||||
}
|
||||
|
||||
|
||||
369
klausur-service/backend/smart_spell.py
Normal file
369
klausur-service/backend/smart_spell.py
Normal file
@@ -0,0 +1,369 @@
|
||||
"""
|
||||
SmartSpellChecker — Language-aware OCR post-correction without LLMs.
|
||||
|
||||
Uses pyspellchecker (MIT) with dual EN+DE dictionaries for:
|
||||
- Automatic language detection per word (dual-dictionary heuristic)
|
||||
- OCR error correction (digit↔letter, umlauts, transpositions)
|
||||
- Context-based disambiguation (a/I, l/I) via bigram lookup
|
||||
- Mixed-language support for example sentences
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Literal, Optional, Set, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Init
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
from spellchecker import SpellChecker as _SpellChecker
|
||||
_en_spell = _SpellChecker(language='en', distance=1)
|
||||
_de_spell = _SpellChecker(language='de', distance=1)
|
||||
_AVAILABLE = True
|
||||
except ImportError:
|
||||
_AVAILABLE = False
|
||||
logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")
|
||||
|
||||
Lang = Literal["en", "de", "both", "unknown"]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bigram context for a/I disambiguation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Words that commonly follow "I" (subject pronoun → verb/modal)
|
||||
_I_FOLLOWERS: frozenset = frozenset({
|
||||
"am", "was", "have", "had", "do", "did", "will", "would", "can",
|
||||
"could", "should", "shall", "may", "might", "must",
|
||||
"think", "know", "see", "want", "need", "like", "love", "hate",
|
||||
"go", "went", "come", "came", "say", "said", "get", "got",
|
||||
"make", "made", "take", "took", "give", "gave", "tell", "told",
|
||||
"feel", "felt", "find", "found", "believe", "hope", "wish",
|
||||
"remember", "forget", "understand", "mean", "meant",
|
||||
"don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
|
||||
"shouldn't", "haven't", "hadn't", "isn't", "wasn't",
|
||||
"really", "just", "also", "always", "never", "often", "sometimes",
|
||||
})
|
||||
|
||||
# Words that commonly follow "a" (article → noun/adjective)
|
||||
_A_FOLLOWERS: frozenset = frozenset({
|
||||
"lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
|
||||
"long", "short", "big", "small", "large", "huge", "tiny",
|
||||
"nice", "beautiful", "wonderful", "terrible", "horrible",
|
||||
"man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
|
||||
"book", "car", "house", "room", "school", "teacher", "student",
|
||||
"day", "week", "month", "year", "time", "place", "way",
|
||||
"friend", "family", "person", "problem", "question", "story",
|
||||
"very", "really", "quite", "rather", "pretty", "single",
|
||||
})
|
||||
|
||||
# Digit→letter substitutions (OCR confusion)
|
||||
_DIGIT_SUBS: Dict[str, List[str]] = {
|
||||
'0': ['o', 'O'],
|
||||
'1': ['l', 'I'],
|
||||
'5': ['s', 'S'],
|
||||
'6': ['g', 'G'],
|
||||
'8': ['b', 'B'],
|
||||
'|': ['I', 'l'],
|
||||
}
|
||||
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
|
||||
|
||||
# Umlaut confusion: OCR drops dots (ü→u, ä→a, ö→o)
|
||||
_UMLAUT_MAP = {
|
||||
'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
|
||||
'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
|
||||
}
|
||||
|
||||
# Tokenizer
|
||||
_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------


@dataclass
class CorrectionResult:
    """Outcome of a single correct_text() call.

    Plain fields only, so the result serializes trivially.
    """

    # Input text exactly as received.
    original: str
    # Text after all corrections were applied.
    corrected: str
    # Language that was used/detected for the correction pass.
    lang_detected: Lang
    # True iff `corrected` differs from `original`.
    changed: bool
    # Human-readable "old→new" descriptions, one per corrected token.
    changes: List[str] = field(default_factory=list)
class SmartSpellChecker:
    """Language-aware OCR spell checker using pyspellchecker (no LLM).

    Layers OCR-specific heuristics on top of the module-level EN/DE
    dictionaries: digit/pipe substitution, German umlaut restoration,
    bigram-based a/I disambiguation, and a cross-language guard so a word
    that is valid in the other language is not "corrected" away.
    """

    def __init__(self):
        # Fail fast if pyspellchecker is missing; the caller falls back to
        # the legacy spell path in that case.
        if not _AVAILABLE:
            raise RuntimeError("pyspellchecker not installed")
        self.en = _en_spell
        self.de = _de_spell

    # --- Language detection ---

    def detect_word_lang(self, word: str) -> Lang:
        """Detect language of a single word using dual-dict heuristic."""
        w = word.lower().strip(".,;:!?\"'()")
        if not w:
            return "unknown"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"

    def detect_text_lang(self, text: str) -> Lang:
        """Detect dominant language of a text string (sentence/phrase).

        Majority vote over per-word detection; words known in both
        languages don't vote. Ties with at least one vote return "both".
        """
        words = re.findall(r"[A-Za-zÄÖÜäöüß]+", text)
        if not words:
            return "unknown"

        en_count = 0
        de_count = 0
        for w in words:
            lang = self.detect_word_lang(w)
            if lang == "en":
                en_count += 1
            elif lang == "de":
                de_count += 1
            # "both" doesn't count for either

        if en_count > de_count:
            return "en"
        if de_count > en_count:
            return "de"
        if en_count == de_count and en_count > 0:
            return "both"
        return "unknown"

    # --- Single-word correction ---

    def _known(self, word: str) -> bool:
        """True if word is known in EN or DE dictionary."""
        w = word.lower()
        return bool(self.en.known([w])) or bool(self.de.known([w]))

    def _known_in(self, word: str, lang: str) -> bool:
        """True if word is known in a specific language dictionary."""
        w = word.lower()
        spell = self.en if lang == "en" else self.de
        return bool(spell.known([w]))

    def correct_word(self, word: str, lang: str = "en",
                     prev_word: str = "", next_word: str = "") -> Optional[str]:
        """Correct a single word for the given language.

        Returns None if no correction needed, or the corrected string.

        Args:
            word: The word to check/correct
            lang: Expected language ("en" or "de")
            prev_word: Previous word (for context)
            next_word: Next word (for context)
        """
        if not word or not word.strip():
            return None

        # Skip numbers, abbreviations with dots, very short tokens
        if word.isdigit() or '.' in word:
            return None

        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)

        # 1. Already known → no fix
        if self._known(word):
            # But check a/I disambiguation for single-char words
            if word.lower() in ('l', '|') and next_word:
                return self._disambiguate_a_I(word, next_word)
            return None

        # 2. Digit/pipe substitution
        if has_suspicious:
            if word == '|':
                return 'I'
            # Try single-char substitutions first (cheapest)
            for i, ch in enumerate(word):
                if ch not in _DIGIT_SUBS:
                    continue
                for replacement in _DIGIT_SUBS[ch]:
                    candidate = word[:i] + replacement + word[i + 1:]
                    if self._known(candidate):
                        return candidate
            # Then multi-char substitution (e.g., "sch00l" → "school")
            multi = self._try_multi_digit_sub(word)
            if multi:
                return multi

        # 3. Umlaut correction (German)
        if lang == "de" and len(word) >= 3 and word.isalpha():
            umlaut_fix = self._try_umlaut_fix(word)
            if umlaut_fix:
                return umlaut_fix

        # 4. General spell correction
        if not has_suspicious and len(word) >= 3 and word.isalpha():
            # Safety: don't correct if the word is valid in the OTHER language
            # (either directly or via umlaut fix) — it likely ended up in the
            # wrong column rather than being a typo.
            other_lang = "de" if lang == "en" else "en"
            if self._known_in(word, other_lang):
                return None
            if other_lang == "de" and self._try_umlaut_fix(word):
                return None  # has a valid DE umlaut variant → don't touch

            spell = self.en if lang == "en" else self.de
            correction = spell.correction(word.lower())
            if correction and correction != word.lower():
                # Preserve the original capitalization.
                if word[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if self._known(correction):
                    return correction

        return None

    # --- Multi-digit substitution ---
    # FIX: the original file defined _try_multi_digit_sub twice. The first
    # (recursive) definition was dead code shadowed by the second and called
    # self._multi_sub_recurse, which does not exist anywhere — it would have
    # raised AttributeError if ever reached. Only the working BFS version is
    # kept; the stray _multi_sub_recurse_result class attribute is removed.

    def _try_multi_digit_sub(self, word: str) -> Optional[str]:
        """Try replacing multiple digits simultaneously using BFS."""
        positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
        if not positions or len(positions) > 4:
            return None

        # BFS over substitution combinations (≤ 3^4 variants for 4 positions:
        # keep-original plus two replacements per suspicious character).
        queue = [list(word)]
        for pos, ch in positions:
            next_queue = []
            for current in queue:
                # Keep original
                next_queue.append(current[:])
                # Try each substitution
                for repl in _DIGIT_SUBS[ch]:
                    variant = current[:]
                    variant[pos] = repl
                    next_queue.append(variant)
            queue = next_queue

        # Check which combinations produce known words
        for combo in queue:
            candidate = "".join(combo)
            if candidate != word and self._known(candidate):
                return candidate

        return None

    # --- Umlaut fix ---

    def _try_umlaut_fix(self, word: str) -> Optional[str]:
        """Try single-char umlaut substitutions for German words."""
        for i, ch in enumerate(word):
            if ch in _UMLAUT_MAP:
                candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
                if self._known(candidate):
                    return candidate
        return None

    # --- a/I disambiguation ---

    def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
        """Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|').

        Uses the following word as bigram context: pronoun "I" is usually
        followed by a verb/modal, article "a" by a noun/adjective.
        """
        nw = next_word.lower().strip(".,;:!?")
        if nw in _I_FOLLOWERS:
            return "I"
        if nw in _A_FOLLOWERS:
            return "a"
        return None  # uncertain, don't change

    # --- Full text correction ---

    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Args:
            text: The text to correct
            lang: Expected language ("en", "de", or "auto" to detect)
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)

        detected = self.detect_text_lang(text) if lang == "auto" else lang

        parts: List[str] = []
        changes: List[str] = []
        tokens = list(_TOKEN_RE.finditer(text))

        # FIX: preserve any leading non-token text (e.g. an opening double
        # quote or bullet) that finditer skips before the first word — the
        # original only re-appended the *trailing* remainder, so such
        # prefixes were silently dropped from the corrected output.
        if tokens and tokens[0].start() > 0:
            parts.append(text[:tokens[0].start()])

        for idx, m in enumerate(tokens):
            token, sep = m.group(1), m.group(2)
            next_word = tokens[idx + 1].group(1) if idx + 1 < len(tokens) else ""
            prev_word = tokens[idx - 1].group(1) if idx > 0 else ""

            correction = self.correct_word(
                token, lang=detected if detected in ("en", "de") else "en",
                prev_word=prev_word, next_word=next_word,
            )
            if correction and correction != token:
                changes.append(f"{token}→{correction}")
                parts.append(correction)
            else:
                parts.append(token)
            parts.append(sep)

        # Append any trailing text after the last token match
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])

        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---

    def correct_vocab_entry(self, english: str, german: str,
                            example: str = "") -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language — the most reliable signal.
        The example sentence may mix languages, so it is auto-detected.
        """
        results = {}
        results["english"] = self.correct_text(english, lang="en")
        results["german"] = self.correct_text(german, lang="de")
        if example:
            # For examples, auto-detect language
            results["example"] = self.correct_text(example, lang="auto")
        return results
||||
210
klausur-service/backend/tests/test_smart_spell.py
Normal file
210
klausur-service/backend/tests/test_smart_spell.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""Tests for SmartSpellChecker — language-aware OCR post-correction."""
|
||||
|
||||
import pytest
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from smart_spell import SmartSpellChecker, CorrectionResult
|
||||
|
||||
|
||||
@pytest.fixture
def sc():
    """Fresh SmartSpellChecker instance for each test."""
    checker = SmartSpellChecker()
    return checker
|
||||
|
||||
|
||||
# ─── Language Detection ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestLanguageDetection:
    """Dual-dictionary language detection — word level and text level."""

    def test_clear_english_words(self, sc):
        english = ("school", "beautiful", "homework", "yesterday", "because")
        for w in english:
            assert sc.detect_word_lang(w) in ("en", "both"), f"{w} should be EN"

    def test_clear_german_words(self, sc):
        german = ("Schule", "Hausaufgaben", "Freundschaft", "Straße", "Entschuldigung")
        for w in german:
            assert sc.detect_word_lang(w) in ("de", "both"), f"{w} should be DE"

    def test_ambiguous_words(self, sc):
        """Words that exist in both languages."""
        ambiguous = ("Hand", "Finger", "Arm", "Name", "Ball")
        for w in ambiguous:
            assert sc.detect_word_lang(w) == "both", f"{w} should be 'both'"

    def test_unknown_words(self, sc):
        assert sc.detect_word_lang("xyzqwk") == "unknown"
        assert sc.detect_word_lang("") == "unknown"

    def test_english_sentence(self, sc):
        assert sc.detect_text_lang("I go to school every day") == "en"

    def test_german_sentence(self, sc):
        assert sc.detect_text_lang("Ich gehe jeden Tag zur Schule") == "de"

    def test_mixed_sentence(self, sc):
        # The dominant language should win
        detected = sc.detect_text_lang("I like to play Fußball with my Freunde")
        assert detected in ("en", "both")
|
||||
|
||||
|
||||
# ─── Single Word Correction ────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSingleWordCorrection:
    """correct_word() on isolated tokens."""

    def test_known_word_not_changed(self, sc):
        for token, lang in (("school", "en"), ("Freund", "de")):
            assert sc.correct_word(token, lang) is None

    def test_digit_letter_single(self, sc):
        for ocr, fixed in (("g0od", "good"), ("he1lo", "hello")):
            assert sc.correct_word(ocr, "en") == fixed

    def test_digit_letter_multi(self, sc):
        """Multiple digit substitutions (e.g., sch00l)."""
        result = sc.correct_word("sch00l", "en")
        assert result == "school", f"Expected 'school', got '{result}'"

    def test_pipe_to_I(self, sc):
        assert sc.correct_word("|", "en") == "I"

    def test_umlaut_schuler(self, sc):
        fixed = sc.correct_word("Schuler", "de")
        assert fixed == "Schüler"

    def test_umlaut_uber(self, sc):
        fixed = sc.correct_word("uber", "de")
        assert fixed == "über"

    def test_umlaut_bucher(self, sc):
        fixed = sc.correct_word("Bucher", "de")
        assert fixed == "Bücher"

    def test_umlaut_turkei(self, sc):
        fixed = sc.correct_word("Turkei", "de")
        assert fixed == "Türkei"

    def test_missing_char(self, sc):
        fixed = sc.correct_word("beautful", "en")
        assert fixed == "beautiful"

    def test_transposition(self, sc):
        fixed = sc.correct_word("teh", "en")
        assert fixed == "the"

    def test_swap(self, sc):
        fixed = sc.correct_word("freind", "en")
        assert fixed == "friend"

    def test_no_false_correction_cross_lang(self, sc):
        """Don't correct a word that's valid in the other language.

        'Schuler' in the EN column should NOT be corrected to 'Schuyler'
        because 'Schüler' is valid German — it's likely a German word
        that ended up in the wrong column (or is a surname).
        """
        # Schuler is valid DE (after umlaut fix → Schüler), so in the
        # EN column it should be left alone.
        result = sc.correct_word("Schuler", "en")
        assert result != "Schuyler", "Should not false-correct German word in EN column"
|
||||
|
||||
|
||||
# ─── a/I Disambiguation ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestAIDisambiguation:
    """_disambiguate_a_I() bigram-context resolution."""

    def test_I_before_verb(self, sc):
        for follower in ("am", "was", "think", "have", "don't"):
            assert sc._disambiguate_a_I("l", follower) == "I"

    def test_a_before_noun_adj(self, sc):
        for follower in ("book", "cat", "big", "lot"):
            assert sc._disambiguate_a_I("a", follower) == "a"

    def test_uncertain_returns_none(self, sc):
        """When context is ambiguous, return None (don't change)."""
        assert sc._disambiguate_a_I("l", "xyzqwk") is None
|
||||
|
||||
|
||||
# ─── Full Text Correction ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFullTextCorrection:
    """correct_text() over complete field values."""

    def test_english_sentence(self, sc):
        res = sc.correct_text("teh cat is beautful", "en")
        assert res.changed
        for fixed in ("the", "beautiful"):
            assert fixed in res.corrected

    def test_german_sentence_no_change(self, sc):
        res = sc.correct_text("Ich gehe zur Schule", "de")
        assert not res.changed

    def test_german_umlaut_fix(self, sc):
        res = sc.correct_text("Der Schuler liest Bucher", "de")
        for fixed in ("Schüler", "Bücher"):
            assert fixed in res.corrected

    def test_preserves_punctuation(self, sc):
        res = sc.correct_text("teh cat, beautful!", "en")
        for mark in (",", "!"):
            assert mark in res.corrected

    def test_empty_text(self, sc):
        res = sc.correct_text("", "en")
        assert not res.changed
        assert res.corrected == ""
|
||||
|
||||
|
||||
# ─── Vocab Entry Correction ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestVocabEntryCorrection:
    """correct_vocab_entry() — column position decides the language."""

    def test_basic_entry(self, sc):
        out = sc.correct_vocab_entry(english="beautful", german="schön")
        assert out["english"].corrected == "beautiful"
        assert out["german"].changed is False

    def test_umlaut_in_german(self, sc):
        out = sc.correct_vocab_entry(english="school", german="Schuler")
        assert out["english"].changed is False
        assert out["german"].corrected == "Schüler"

    def test_example_auto_detect(self, sc):
        out = sc.correct_vocab_entry(
            english="friend",
            german="Freund",
            example="My best freind lives in Berlin",
        )
        assert "friend" in out["example"].corrected
|
||||
|
||||
|
||||
# ─── Speed ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSpeed:
    """Throughput guard for the correction pipeline."""

    def test_100_corrections_under_500ms(self, sc):
        """100 word corrections should complete in under 500ms."""
        import time

        samples = [
            ("beautful", "en"), ("teh", "en"), ("freind", "en"),
            ("homwork", "en"), ("yesturday", "en"),
            ("Schuler", "de"), ("Bucher", "de"), ("Turkei", "de"),
            ("uber", "de"), ("Ubung", "de"),
        ] * 10

        start = time.time()
        for token, lang in samples:
            sc.correct_word(token, lang)
        dt = time.time() - start

        print(f"\n 100 corrections in {dt*1000:.0f}ms")
        assert dt < 0.5, f"Too slow: {dt*1000:.0f}ms"
|
||||
494
klausur-service/backend/tests/test_spell_benchmark.py
Normal file
494
klausur-service/backend/tests/test_spell_benchmark.py
Normal file
@@ -0,0 +1,494 @@
|
||||
"""
|
||||
Benchmark: Spell-checking & language detection approaches for OCR post-correction.
|
||||
|
||||
Tests pyspellchecker (already used), symspellpy (candidate), and
|
||||
dual-dictionary language detection heuristic on real vocabulary OCR data.
|
||||
|
||||
Run: pytest tests/test_spell_benchmark.py -v -s
|
||||
"""
|
||||
|
||||
import time
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_pyspellchecker():
    """Build the EN and DE pyspellchecker instances (edit distance 1)."""
    from spellchecker import SpellChecker

    english = SpellChecker(language='en', distance=1)
    german = SpellChecker(language='de', distance=1)
    return english, german
|
||||
|
||||
|
||||
def _load_symspellpy():
    """Load symspellpy with English frequency dict (bundled).

    Returns (SymSpell instance, Verbosity enum).

    FIX: locate the bundled dictionary via stdlib importlib.resources
    instead of the deprecated pkg_resources API (emits warnings on modern
    Python and has been removed from recent setuptools). Falls back to
    pkg_resources on interpreters without importlib.resources.files.
    """
    from symspellpy import SymSpell, Verbosity
    sym = SymSpell(max_dictionary_edit_distance=2)
    # Use bundled English frequency dict
    try:
        from importlib.resources import files
        dict_path = str(files("symspellpy") / "frequency_dictionary_en_82_765.txt")
    except ImportError:  # Python < 3.9 — keep the legacy lookup
        import pkg_resources
        dict_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym.load_dictionary(dict_path, term_index=0, count_index=1)
    return sym, Verbosity
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test data: (ocr_output, expected_correction, language, category)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
OCR_TEST_CASES = [
    # Single-char ambiguity (a vs I, l vs I).
    ("l am a student", "I am a student", "en", "a_vs_I"),
    ("a book", "a book", "en", "a_vs_I"),                    # must stay unchanged
    ("I like cats", "I like cats", "en", "a_vs_I"),          # must stay unchanged
    ("lt is raining", "It is raining", "en", "a_vs_I"),      # l→I at start

    # Digit/letter confusion from OCR.
    ("g0od", "good", "en", "digit_letter"),
    ("sch00l", "school", "en", "digit_letter"),
    ("he1lo", "hello", "en", "digit_letter"),
    ("Sch0n", "Schon", "de", "digit_letter"),                # German

    # Dropped umlaut dots.
    ("schon", "schön", "de", "umlaut"),                      # "schon" is also valid DE!
    ("Schuler", "Schüler", "de", "umlaut"),
    ("uber", "über", "de", "umlaut"),
    ("Bucher", "Bücher", "de", "umlaut"),
    ("Turkei", "Türkei", "de", "umlaut"),

    # Common OCR typo shapes.
    ("beautful", "beautiful", "en", "missing_char"),
    ("teh", "the", "en", "transposition"),
    ("becasue", "because", "en", "transposition"),
    ("freind", "friend", "en", "swap"),
    ("Freund", "Freund", "de", "correct"),                   # already correct

    # Words fused together by OCR.
    ("atmyschool", "at my school", "en", "merged"),
    ("goodidea", "good idea", "en", "merged"),

    # Mixed-language example sentences.
    ("I go to teh school", "I go to the school", "en", "sentence"),
    ("Ich gehe zur Schule", "Ich gehe zur Schule", "de", "sentence_correct"),
]
|
||||
|
||||
# Language detection test: (word, expected_language)
|
||||
LANG_DETECT_CASES = [
    # Unambiguously English.
    ("school", "en"),
    ("beautiful", "en"),
    ("homework", "en"),
    ("yesterday", "en"),
    ("children", "en"),
    ("because", "en"),
    ("environment", "en"),
    ("although", "en"),

    # Unambiguously German.
    ("Schule", "de"),
    ("Hausaufgaben", "de"),
    ("Freundschaft", "de"),
    ("Umwelt", "de"),
    ("Kindergarten", "de"),  # also used in English!
    ("Bücher", "de"),
    ("Straße", "de"),
    ("Entschuldigung", "de"),

    # Valid in both languages.
    ("Hand", "both"),
    ("Finger", "both"),
    ("Arm", "both"),
    ("Name", "both"),
    ("Ball", "both"),

    # Short / tricky function words.
    ("a", "en"),
    ("I", "en"),
    ("in", "both"),
    ("an", "both"),
    ("the", "en"),
    ("die", "de"),
    ("der", "de"),
    ("to", "en"),
    ("zu", "de"),
]
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Tests
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestPyspellchecker:
    """Test pyspellchecker capabilities for OCR correction."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # One EN and one DE checker, shared by every test in this class.
        self.en, self.de = _load_pyspellchecker()

    def test_known_words(self):
        """Verify basic dictionary lookup."""
        # Words that must be present in their respective dictionary
        # (the DE dictionary stores lowercase forms).
        for checker, word in ((self.en, "school"), (self.en, "beautiful"),
                              (self.de, "schule"), (self.de, "freund")):
            assert checker.known([word])
        # Nonsense must be absent from both dictionaries.
        for checker in (self.en, self.de):
            assert not checker.known(["xyzqwk"])

    def test_correction_quality(self):
        """Test correction suggestions for OCR errors."""
        multi_word = ("sentence", "sentence_correct", "merged", "a_vs_I")
        results = []
        for ocr, expected, lang, category in OCR_TEST_CASES:
            if category in multi_word:
                continue  # skip multi-word cases

            checker = self.en if lang == "en" else self.de
            fixed_tokens = []
            for token in ocr.split():
                if checker.known([token.lower()]):
                    fixed_tokens.append(token)
                    continue
                suggestion = checker.correction(token.lower())
                if suggestion and suggestion != token.lower():
                    # Preserve the original token's leading case.
                    if token[0].isupper():
                        suggestion = suggestion[0].upper() + suggestion[1:]
                    fixed_tokens.append(suggestion)
                else:
                    fixed_tokens.append(token)
            result = " ".join(fixed_tokens)
            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            if ok:
                print(f"  OK:   '{ocr}' → '{result}' [{category}]")
            else:
                print(f"  MISS: '{ocr}' → '{result}' (expected '{expected}') [{category}]")

        correct = sum(1 for _, _, _, ok, _ in results if ok)
        total = len(results)
        print(f"\npyspellchecker: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_language_detection_heuristic(self):
        """Test dual-dictionary language detection."""
        outcomes = []
        for word, expected_lang in LANG_DETECT_CASES:
            lowered = word.lower()
            hit_en = bool(self.en.known([lowered]))
            hit_de = bool(self.de.known([lowered]))

            # Classify by which dictionaries recognise the word.
            if hit_en and hit_de:
                detected = "both"
            elif hit_en:
                detected = "en"
            elif hit_de:
                detected = "de"
            else:
                detected = "unknown"

            ok = detected == expected_lang
            outcomes.append((word, expected_lang, detected, ok))
            if ok:
                print(f"  OK:   '{word}' → {detected}")
            else:
                print(f"  MISS: '{word}' → {detected} (expected {expected_lang})")

        correct = sum(1 for *_, ok in outcomes if ok)
        total = len(outcomes)
        print(f"\nLang detection heuristic: {correct}/{total} correct ({100*correct/total:.0f}%)")

    def test_umlaut_awareness(self):
        """Test if pyspellchecker suggests umlaut corrections."""
        # Each ASCII-folded word should surface its umlaut form among the candidates.
        candidates = self.de.candidates("schuler")  # hoped-for: "Schüler"
        print(f"  'schuler' candidates: {candidates}")
        candidates_uber = self.de.candidates("uber")  # hoped-for: "über"
        print(f"  'uber' candidates: {candidates_uber}")
        candidates_turkei = self.de.candidates("turkei")  # hoped-for: "Türkei"
        print(f"  'turkei' candidates: {candidates_turkei}")

    def test_speed_100_words(self):
        """Measure correction speed for 100 words."""
        misspelled_en = ["beautful", "teh", "becasue", "freind", "shcool",
                        "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        started = time.time()
        for w in misspelled_en:
            self.en.correction(w)
        dt = time.time() - started
        print(f"\n  pyspellchecker: 100 EN corrections in {dt*1000:.0f}ms")

        misspelled_de = ["schuler", "bucher", "turkei", "strasze", "entschuldigung",
                        "kindergaten", "freumd", "hauaufgaben", "umwlt", "ubung"] * 10
        started = time.time()
        for w in misspelled_de:
            self.de.correction(w)
        dt = time.time() - started
        print(f"  pyspellchecker: 100 DE corrections in {dt*1000:.0f}ms")
|
||||
|
||||
|
||||
class TestSymspellpy:
    """Test symspellpy as a faster alternative."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # symspellpy is optional — skip the whole class when the package
        # or its bundled frequency dictionary is missing.
        try:
            self.sym, self.Verbosity = _load_symspellpy()
            self.available = True
        except (ImportError, FileNotFoundError) as e:
            self.available = False
            pytest.skip(f"symspellpy not installed: {e}")

    def test_correction_quality(self):
        """Test symspellpy corrections (EN only — no DE dict bundled).

        FIX: the original comprehension bound the language field to ``_``
        and then compared it (``if _ == "en"``) — ``_`` conventionally marks
        an *unused* value, so the filter read as dead code. The field is
        now bound to a real name.  Also guards the percentage against an
        empty case list (ZeroDivisionError).
        """
        en_cases = [(ocr, expected, cat)
                    for ocr, expected, lang, cat in OCR_TEST_CASES
                    if lang == "en"
                    and cat not in ("sentence", "sentence_correct", "merged", "a_vs_I")]

        results = []
        for ocr, expected, category in en_cases:
            suggestions = self.sym.lookup(ocr.lower(), self.Verbosity.CLOSEST, max_edit_distance=2)
            if suggestions:
                fix = suggestions[0].term
                if ocr[0].isupper():
                    # Preserve the original token's leading case.
                    fix = fix[0].upper() + fix[1:]
                result = fix
            else:
                result = ocr  # no suggestion → leave unchanged

            ok = result == expected
            results.append((ocr, expected, result, ok, category))
            status = "OK" if ok else "MISS"
            print(f"  {status}: '{ocr}' → '{result}' (expected '{expected}') [{category}]")

        correct = sum(1 for *_, ok, _ in results if ok)
        total = len(results)
        # Avoid ZeroDivisionError when every case was filtered out.
        pct = 100 * correct / total if total else 0.0
        print(f"\nsymspellpy EN: {correct}/{total} correct ({pct:.0f}%)")

    def test_speed_100_words(self):
        """Measure symspellpy correction speed for 100 words."""
        words = ["beautful", "teh", "becasue", "freind", "shcool",
                 "homwork", "yesturday", "chilren", "becuse", "enviroment"] * 10
        t0 = time.time()
        for w in words:
            self.sym.lookup(w, self.Verbosity.CLOSEST, max_edit_distance=2)
        dt = time.time() - t0
        print(f"\n  symspellpy: 100 EN corrections in {dt*1000:.0f}ms")

    def test_compound_segmentation(self):
        """Test symspellpy's word segmentation for merged words."""
        cases = [
            ("atmyschool", "at my school"),
            ("goodidea", "good idea"),
            ("makeadecision", "make a decision"),
        ]
        for merged, expected in cases:
            # word_segmentation() splits run-together OCR tokens.
            result = self.sym.word_segmentation(merged)
            ok = result.corrected_string == expected
            status = "OK" if ok else "MISS"
            print(f"  {status}: '{merged}' → '{result.corrected_string}' (expected '{expected}')")
|
||||
|
||||
|
||||
class TestContextDisambiguation:
    """Test context-based disambiguation for a/I and similar cases."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def test_bigram_context(self):
        """Use simple bigram heuristic for a/I disambiguation.

        Approach: check if 'a <next_word>' or 'I <next_word>' is more
        common by checking if <next_word> is a noun (follows 'a') or
        verb (follows 'I').
        """
        # Words that typically follow the pronoun "I" (verbs/auxiliaries).
        i_followers = {"am", "was", "have", "had", "do", "did", "will",
                       "would", "can", "could", "should", "shall", "may",
                       "might", "think", "know", "see", "want", "need",
                       "like", "love", "hate", "go", "went", "come",
                       "came", "say", "said", "get", "got", "make", "made",
                       "take", "took", "give", "gave", "tell", "told",
                       "feel", "felt", "find", "found", "believe", "hope",
                       "remember", "forget", "understand", "mean", "meant",
                       "don't", "didn't", "can't", "won't", "couldn't",
                       "shouldn't", "wouldn't", "haven't", "hadn't"}

        # Words that typically follow the article "a" (nouns/adjectives).
        a_followers = {"lot", "few", "little", "bit", "good", "bad",
                       "big", "small", "great", "new", "old", "long",
                       "short", "man", "woman", "boy", "girl", "dog",
                       "cat", "book", "car", "house", "day", "year",
                       "nice", "beautiful", "large", "huge", "tiny"}

        def resolve(token: str, next_word: str) -> str:
            """Given an ambiguous 'a' or 'I' (or 'l'), pick the right one."""
            follower = next_word.lower()
            if follower in i_followers:
                return "I"
            if follower in a_followers:
                return "a"
            # Fallback idea: known verb → "I", known adj/noun → "a";
            # until then, keep the token untouched when uncertain.
            return token

        cases = [
            ("l", "am", "I"),
            ("l", "was", "I"),
            ("l", "think", "I"),
            ("a", "book", "a"),
            ("a", "cat", "a"),
            ("a", "lot", "a"),
            ("l", "big", "a"),   # "a big ..."
            ("a", "have", "I"),  # "I have ..."
        ]

        outcomes = []
        for token, next_word, expected in cases:
            picked = resolve(token, next_word)
            ok = picked == expected
            outcomes.append((token, next_word, expected, picked, ok))
            status = "OK" if ok else "MISS"
            print(f"  {status}: '{token} {next_word}...' → '{picked}' (expected '{expected}')")

        correct = sum(1 for *_, ok in outcomes if ok)
        total = len(outcomes)
        print(f"\na/I disambiguation: {correct}/{total} correct ({100*correct/total:.0f}%)")
|
||||
|
||||
|
||||
class TestLangDetectLibrary:
    """Test py3langid or langdetect if available."""

    @staticmethod
    def _require_langid():
        """Return a langid-compatible module, preferring py3langid.

        FIX: the class docstring (and the name test_py3langid) promise
        py3langid support, but the original only ever imported the legacy
        ``langid`` package.  py3langid exposes the same ``classify()``
        API, so try it first and fall back to ``langid``; skip the test
        when neither is installed.
        """
        try:
            import py3langid as langid_mod
            return langid_mod
        except ImportError:
            pass
        try:
            import langid as langid_mod
            return langid_mod
        except ImportError:
            pytest.skip("py3langid/langid not installed")

    def test_py3langid(self):
        """Sentence-level EN/DE detection — langid's intended use case."""
        langid = self._require_langid()

        sentences = [
            ("I go to school every day", "en"),
            ("Ich gehe jeden Tag zur Schule", "de"),
            ("The weather is nice today", "en"),
            ("Das Wetter ist heute schön", "de"),
            ("She likes to play football", "en"),
            ("Er spielt gerne Fußball", "de"),
        ]

        results = []
        for text, expected in sentences:
            lang, confidence = langid.classify(text)
            ok = lang == expected
            results.append(ok)
            status = "OK" if ok else "MISS"
            print(f"  {status}: '{text[:40]}...' → {lang} ({confidence:.2f}) (expected {expected})")

        correct = sum(results)
        print(f"\nlangid sentence detection: {correct}/{len(results)} correct")

    def test_langid_single_words(self):
        """langid on single words — expected to be unreliable."""
        langid = self._require_langid()

        words = [("school", "en"), ("Schule", "de"), ("book", "en"),
                 ("Buch", "de"), ("car", "en"), ("Auto", "de"),
                 ("a", "en"), ("I", "en"), ("der", "de"), ("the", "en")]

        results = []
        for word, expected in words:
            lang, conf = langid.classify(word)
            ok = lang == expected
            results.append(ok)
            status = "OK" if ok else "MISS"
            print(f"  {status}: '{word}' → {lang} ({conf:.2f}) (expected {expected})")

        correct = sum(results)
        print(f"\nlangid single-word: {correct}/{len(results)} correct")
|
||||
|
||||
|
||||
class TestIntegratedApproach:
    """Test the combined approach: dict-heuristic for lang + spell correction."""

    @pytest.fixture(autouse=True)
    def setup(self):
        self.en, self.de = _load_pyspellchecker()

    def detect_language(self, word: str) -> str:
        """Dual-dict heuristic language detection."""
        lowered = word.lower()
        if len(lowered) <= 2:
            # Very short words are too ambiguous to classify.
            return "ambiguous"
        hit_en = bool(self.en.known([lowered]))
        hit_de = bool(self.de.known([lowered]))
        if hit_en:
            return "both" if hit_de else "en"
        if hit_de:
            return "de"
        return "unknown"

    def correct_word(self, word: str, expected_lang: str) -> str:
        """Correct a single word given the expected language."""
        lowered = word.lower()
        if expected_lang == "en":
            primary, secondary = self.en, self.de
        else:
            primary, secondary = self.de, self.en

        # Recognised by either dictionary → keep the word as-is
        # (a valid word in the *other* language is not an error).
        if primary.known([lowered]) or secondary.known([lowered]):
            return word

        suggestion = primary.correction(lowered)
        if not suggestion or suggestion == lowered:
            return word
        if word[0].isupper():
            # Preserve the original leading case.
            suggestion = suggestion[0].upper() + suggestion[1:]
        return suggestion

    def test_full_pipeline(self):
        """Test: detect language → correct with appropriate dict."""
        vocab_entries = [
            # (english_col, german_col, expected_en, expected_de)
            ("beautful", "schön", "beautiful", "schön"),
            ("school", "Schule", "school", "Schule"),
            ("teh cat", "die Katze", "the cat", "die Katze"),
            ("freind", "Freund", "friend", "Freund"),
            ("homwork", "Hausaufgaben", "homework", "Hausaufgaben"),
            ("Schuler", "Schuler", "Schuler", "Schüler"),  # DE umlaut: Schüler
        ]

        en_correct = 0
        de_correct = 0
        total = len(vocab_entries)

        for en_ocr, de_ocr, exp_en, exp_de in vocab_entries:
            # Correct every token of each column with its own dictionary.
            en_fixed = " ".join(self.correct_word(w, "en") for w in en_ocr.split())
            de_fixed = " ".join(self.correct_word(w, "de") for w in de_ocr.split())

            en_ok = en_fixed == exp_en
            de_ok = de_fixed == exp_de
            en_correct += en_ok
            de_correct += de_ok

            en_status = "OK" if en_ok else "MISS"
            de_status = "OK" if de_ok else "MISS"
            print(f"  EN {en_status}: '{en_ocr}' → '{en_fixed}' (expected '{exp_en}')")
            print(f"  DE {de_status}: '{de_ocr}' → '{de_fixed}' (expected '{exp_de}')")

        print(f"\nEN corrections: {en_correct}/{total} correct")
        print(f"DE corrections: {de_correct}/{total} correct")
|
||||
Reference in New Issue
Block a user