"""
SmartSpellChecker — Language-aware OCR post-correction without LLMs.

Uses pyspellchecker (MIT) with dual EN+DE dictionaries for:
- Automatic language detection per word (dual-dictionary heuristic)
- OCR error correction (digit↔letter, umlauts, transpositions)
- Context-based disambiguation (a/I, l/I) via bigram lookup
- Mixed-language support for example sentences

License: Apache 2.0 (commercial use permitted)
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional, Set, Tuple

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Init
# ---------------------------------------------------------------------------

# pyspellchecker is an optional dependency: when it is missing the module
# still imports, but constructing SmartSpellChecker raises RuntimeError.
try:
    from spellchecker import SpellChecker as _SpellChecker
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _AVAILABLE = True
except ImportError:
    _AVAILABLE = False
    logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")

Lang = Literal["en", "de", "both", "unknown"]


def _load_known_abbreviations():
    """Return the optional abbreviation vocabulary, or an empty set.

    The vocabulary lives in the optional ``cv_ocr_engines`` module. The import
    is attempted on every call (successful imports are cached by Python), so
    the module stays usable when ``cv_ocr_engines`` is not installed.

    Returns:
        A container supporting ``in`` with lowercase abbreviation strings
        (presumably entries such as "sth", "sb" — verify against
        ``cv_ocr_engines``), or an empty frozenset if the import fails.
    """
    try:
        from cv_ocr_engines import _KNOWN_ABBREVIATIONS
    except ImportError:
        return frozenset()
    return _KNOWN_ABBREVIATIONS


# ---------------------------------------------------------------------------
# Bigram context for a/I disambiguation
# ---------------------------------------------------------------------------

# Words that commonly follow "I" (subject pronoun → verb/modal)
_I_FOLLOWERS: frozenset = frozenset({
    "am", "was", "have", "had", "do", "did", "will", "would", "can",
    "could", "should", "shall", "may", "might", "must",
    "think", "know", "see", "want", "need", "like", "love", "hate",
    "go", "went", "come", "came", "say", "said", "get", "got",
    "make", "made", "take", "took", "give", "gave", "tell", "told",
    "feel", "felt", "find", "found", "believe", "hope", "wish",
    "remember", "forget", "understand", "mean", "meant",
    "don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
    "shouldn't", "haven't", "hadn't", "isn't", "wasn't",
    "really", "just", "also", "always", "never", "often", "sometimes",
})

# Words that commonly follow "a" (article → noun/adjective)
_A_FOLLOWERS: frozenset = frozenset({
    "lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
    "long", "short", "big", "small", "large", "huge", "tiny",
    "nice", "beautiful", "wonderful", "terrible", "horrible",
    "man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
    "book", "car", "house", "room", "school", "teacher", "student",
    "day", "week", "month", "year", "time", "place", "way",
    "friend", "family", "person", "problem", "question", "story",
    "very", "really", "quite", "rather", "pretty", "single",
})

# Digit→letter substitutions (OCR confusion)
_DIGIT_SUBS: Dict[str, List[str]] = {
    '0': ['o', 'O'],
    '1': ['l', 'I'],
    '5': ['s', 'S'],
    '6': ['g', 'G'],
    '8': ['b', 'B'],
    '|': ['I', 'l'],
}
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())

# Umlaut confusion: OCR drops dots (ü→u, ä→a, ö→o)
_UMLAUT_MAP = {
    'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
    'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
}

# Tokenizer: group 1 is a run of word characters (letters, umlauts, ß,
# apostrophe, pipe); group 2 is everything up to the next word character.
# NOTE(review): digits are not word characters here, so a token such as
# "sch00l" is split into "sch" / "l" before correct_word() sees it; the digit
# substitutions are therefore only reached when correct_word() is called
# directly. Confirm whether that is intended.
_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")


# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------

@dataclass
class CorrectionResult:
    """Outcome of correcting one text field."""

    original: str        # input text, untouched
    corrected: str       # text after all correction passes
    lang_detected: Lang  # detected language, or the caller-supplied one
    changed: bool        # True when corrected != original
    changes: List[str] = field(default_factory=list)  # "old→new" entries


# ---------------------------------------------------------------------------
# Core class
# ---------------------------------------------------------------------------

class SmartSpellChecker:
    """Language-aware OCR spell checker using pyspellchecker (no LLM)."""

    def __init__(self):
        """Bind the shared EN/DE dictionaries.

        Raises:
            RuntimeError: if pyspellchecker could not be imported.
        """
        if not _AVAILABLE:
            raise RuntimeError("pyspellchecker not installed")
        self.en = _en_spell
        self.de = _de_spell

    # --- Language detection ---

    def detect_word_lang(self, word: str) -> Lang:
        """Detect language of a single word using dual-dict heuristic.

        Returns "both" when the word is in both dictionaries, "en"/"de" when
        it is in exactly one, and "unknown" otherwise (including when nothing
        is left after stripping surrounding punctuation).
        """
        w = word.lower().strip(".,;:!?\"'()")
        if not w:
            return "unknown"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"

    def detect_text_lang(self, text: str) -> Lang:
        """Detect dominant language of a text string (sentence/phrase).

        Counts words that belong to exactly one dictionary. Returns the
        language with more such words, "both" on a non-zero tie, and
        "unknown" when no word could be attributed to a single language.
        """
        words = re.findall(r"[A-Za-zÄÖÜäöüß]+", text)
        if not words:
            return "unknown"

        en_count = 0
        de_count = 0
        for w in words:
            lang = self.detect_word_lang(w)
            if lang == "en":
                en_count += 1
            elif lang == "de":
                de_count += 1
            # "both" doesn't count for either

        if en_count > de_count:
            return "en"
        if de_count > en_count:
            return "de"
        if en_count == de_count and en_count > 0:
            return "both"
        return "unknown"

    # --- Single-word correction ---

    def _known(self, word: str) -> bool:
        """True if word is known in EN or DE dictionary, or is a known abbreviation."""
        w = word.lower()
        if bool(self.en.known([w])) or bool(self.de.known([w])):
            return True
        # Also accept known abbreviations (sth, sb, adj, etc.)
        return w in _load_known_abbreviations()

    def _known_in(self, word: str, lang: str) -> bool:
        """True if word is known in a specific language dictionary.

        Any ``lang`` other than "en" selects the German dictionary.
        """
        w = word.lower()
        spell = self.en if lang == "en" else self.de
        return bool(spell.known([w]))

    def correct_word(self, word: str, lang: str = "en",
                     prev_word: str = "", next_word: str = "") -> Optional[str]:
        """Correct a single word for the given language.

        Returns None if no correction needed, or the corrected string.

        Args:
            word: The word to check/correct
            lang: Expected language ("en" or "de")
            prev_word: Previous word (for context; currently unused)
            next_word: Next word (for context)
        """
        if not word or not word.strip():
            return None

        # Skip pure numbers and anything containing a dot (abbreviations)
        if word.isdigit() or '.' in word:
            return None

        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)

        # 1. Already known → no fix
        if self._known(word):
            # But check a/I disambiguation for single-char words
            if word.lower() in ('l', '|') and next_word:
                return self._disambiguate_a_I(word, next_word)
            return None

        # 2. Digit/pipe substitution
        if has_suspicious:
            if word == '|':
                return 'I'
            # Try single-char substitutions
            for i, ch in enumerate(word):
                if ch not in _DIGIT_SUBS:
                    continue
                for replacement in _DIGIT_SUBS[ch]:
                    candidate = word[:i] + replacement + word[i + 1:]
                    if self._known(candidate):
                        return candidate
            # Try multi-char substitution (e.g., "sch00l" → "school")
            multi = self._try_multi_digit_sub(word)
            if multi:
                return multi

        # 3. Umlaut correction (German)
        if lang == "de" and len(word) >= 3 and word.isalpha():
            umlaut_fix = self._try_umlaut_fix(word)
            if umlaut_fix:
                return umlaut_fix

        # 4. General spell correction
        if not has_suspicious and len(word) >= 3 and word.isalpha():
            # Safety: don't correct if the word is valid in the OTHER language
            # (either directly or via umlaut fix)
            other_lang = "de" if lang == "en" else "en"
            if self._known_in(word, other_lang):
                return None
            if other_lang == "de" and self._try_umlaut_fix(word):
                return None  # has a valid DE umlaut variant → don't touch

            spell = self.en if lang == "en" else self.de
            correction = spell.correction(word.lower())
            if correction and correction != word.lower():
                # Restore the original's leading capital on the suggestion
                if word[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if self._known(correction):
                    return correction

        return None

    # --- Multi-digit substitution ---

    def _try_multi_digit_sub(self, word: str) -> Optional[str]:
        """Try replacing multiple digits simultaneously.

        Expands, position by position, every combination of "keep the
        character" and "use one of its letter substitutions", then returns the
        first combination that differs from ``word`` and is a known word.
        Returns None when there are no substitutable characters, more than
        four of them, or no combination is known.
        """
        positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
        if not positions or len(positions) > 4:
            return None

        # Breadth-first expansion: at most 3^4 = 81 variants
        queue = [list(word)]
        for pos, ch in positions:
            next_queue = []
            for current in queue:
                # Keep original
                next_queue.append(current[:])
                # Try each substitution
                for repl in _DIGIT_SUBS[ch]:
                    variant = current[:]
                    variant[pos] = repl
                    next_queue.append(variant)
            queue = next_queue

        # Check which combinations produce known words
        for combo in queue:
            candidate = "".join(combo)
            if candidate != word and self._known(candidate):
                return candidate

        return None

    # --- Umlaut fix ---

    def _try_umlaut_fix(self, word: str) -> Optional[str]:
        """Try single-char umlaut substitutions for German words.

        Returns the first variant with exactly one character replaced via
        ``_UMLAUT_MAP`` that is a known word, or None.
        """
        for i, ch in enumerate(word):
            if ch in _UMLAUT_MAP:
                candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
                if self._known(candidate):
                    return candidate
        return None

    # --- Boundary repair (shifted word boundaries) ---

    def _try_boundary_repair(self, word1: str, word2: str) -> Optional[Tuple[str, str]]:
        """Fix shifted word boundaries between adjacent tokens.

        OCR sometimes shifts the boundary: "at sth." → "ats th."
        Try moving 1-2 chars from end of word1 to start of word2 and vice versa.
        Returns (fixed_word1, fixed_word2) or None.
        """
        # Known abbreviations for vocabulary context
        abbreviations = _load_known_abbreviations()

        # Strip trailing punctuation for checking, preserve for result
        w2_stripped = word2.rstrip(".,;:!?")
        w2_punct = word2[len(w2_stripped):]

        # Try shifting 1-2 chars from word1 → word2
        for shift in (1, 2):
            if len(word1) <= shift:
                continue
            new_w1 = word1[:-shift]
            new_w2_base = word1[-shift:] + w2_stripped

            w1_ok = self._known(new_w1) or new_w1.lower() in abbreviations
            w2_ok = self._known(new_w2_base) or new_w2_base.lower() in abbreviations

            if w1_ok and w2_ok:
                return (new_w1, new_w2_base + w2_punct)

        # Try shifting 1-2 chars from word2 → word1
        for shift in (1, 2):
            if len(w2_stripped) <= shift:
                continue
            new_w1 = word1 + w2_stripped[:shift]
            new_w2_base = w2_stripped[shift:]

            w1_ok = self._known(new_w1) or new_w1.lower() in abbreviations
            w2_ok = self._known(new_w2_base) or new_w2_base.lower() in abbreviations

            if w1_ok and w2_ok:
                return (new_w1, new_w2_base + w2_punct)

        return None

    # --- Context-based word split for ambiguous merges ---

    # Patterns where a valid word is actually "a" + adjective/noun
    _ARTICLE_SPLIT_CANDIDATES = {
        # word → (article, remainder) — only when followed by a compatible word
        "anew": ("a", "new"),
        "areal": ("a", "real"),
        "alive": None,  # genuinely one word, never split
        "alone": None,
        "aware": None,
        "alike": None,
        "apart": None,
        "aside": None,
        "above": None,
        "about": None,
        "among": None,
        "along": None,
    }

    def _try_context_split(self, word: str, next_word: str,
                           prev_word: str) -> Optional[str]:
        """Split words like 'anew' → 'a new' when context indicates a merge.

        Only splits when:
        - The word is in the split candidates list and is followed by a word
          that starts lowercase or is known
        - OR the word is unknown and can be split into article + known word

        ``prev_word`` is accepted for context but currently unused.
        Returns the split string ("a new") or None.
        """
        w_lower = word.lower()

        # Check explicit candidates
        if w_lower in self._ARTICLE_SPLIT_CANDIDATES:
            split = self._ARTICLE_SPLIT_CANDIDATES[w_lower]
            if split is None:
                return None  # explicitly marked as "don't split"
            article, remainder = split
            # Only split if followed by a word (noun pattern)
            if next_word and next_word[0].islower():
                return f"{article} {remainder}"
            # Also split if remainder + next_word makes a common phrase
            if next_word and self._known(next_word):
                return f"{article} {remainder}"

        # Generic: if word starts with 'a' and rest is a known adjective/word
        if (len(word) >= 4 and word[0].lower() == 'a'
                and not self._known(word)  # only for UNKNOWN words
                and self._known(word[1:])):
            return f"a {word[1:]}"

        return None

    # --- a/I disambiguation ---

    def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
        """Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|').

        Decides purely from the following word: "I" if it is in
        ``_I_FOLLOWERS`` (checked first), "a" if it is in ``_A_FOLLOWERS``,
        otherwise None (uncertain, leave the token unchanged).
        """
        nw = next_word.lower().strip(".,;:!?")
        if nw in _I_FOLLOWERS:
            return "I"
        if nw in _A_FOLLOWERS:
            return "a"
        # No further fallback (e.g. part-of-speech or capitalization
        # heuristics) is implemented.
        return None  # uncertain, don't change

    # --- Full text correction ---

    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Three passes:
        1. Boundary repair — fix shifted word boundaries between adjacent tokens
        2. Context split — split ambiguous merges (anew → a new)
        3. Per-word correction — spell check individual words

        Text that is not part of any token (leading punctuation/digits, the
        separators between words) is carried over unchanged.

        Args:
            text: The text to correct
            lang: Expected language ("en" or "de"), or "auto" to detect it
                from the text; anything that does not resolve to "en"/"de"
                is corrected as English.
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)

        detected = self.detect_text_lang(text) if lang == "auto" else lang
        effective_lang = detected if detected in ("en", "de") else "en"

        changes: List[str] = []
        tokens = list(_TOKEN_RE.finditer(text))

        # Mutable token list: [[word, separator], ...]
        token_list: List[List[str]] = [[m.group(1), m.group(2)] for m in tokens]

        # --- Pass 1: Boundary repair between adjacent unknown words ---
        abbrevs = _load_known_abbreviations()

        # Index loop on purpose: entries i and i+1 are rewritten in place and
        # a repaired word must be visible to the next iteration.
        for i in range(len(token_list) - 1):
            w1 = token_list[i][0]
            w2_raw = token_list[i + 1][0]

            # Include trailing punct from separator in w2 for abbreviation matching
            # e.g., "ats" + " " + "th" + "." → try repair("ats", "th.")
            w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")

            # Skip if both are known AND neither is suspiciously short (≤3 chars)
            # Short known words like "ats", "th" may be OCR boundary errors
            both_known = self._known(w1) and self._known(w2_raw)
            both_long = len(w1) > 3 and len(w2_raw) > 3
            if both_known and both_long:
                continue

            # Try with punctuation first (for abbreviations like "sth.")
            repair = self._try_boundary_repair(w1, w2_with_punct)
            if not repair and w2_with_punct != w2_raw:
                repair = self._try_boundary_repair(w1, w2_raw)
            if repair:
                new_w1, new_w2_full = repair
                new_w2_base = new_w2_full.rstrip(".,;:!?")

                # Quality gate: compare "how many originals are ≥3 chars" with
                # "how many repaired words are known/abbreviations".
                # NOTE(review): _try_boundary_repair only returns a pair when
                # both words pass this same known/abbreviation check, so
                # new_score appears to always be 2 and the gate always passes.
                old_score = (len(w1) >= 3) + (len(w2_raw) >= 3)
                new_score = (
                    (self._known(new_w1) or new_w1.lower() in abbrevs)
                    + (self._known(new_w2_base) or new_w2_base.lower() in abbrevs)
                )
                # Accept if new pair scores higher, or if it includes an abbreviation
                has_abbrev = new_w1.lower() in abbrevs or new_w2_base.lower() in abbrevs
                if new_score >= old_score or has_abbrev:
                    new_w2_punct = new_w2_full[len(new_w2_base):]
                    changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
                    token_list[i][0] = new_w1
                    token_list[i + 1][0] = new_w2_base
                    if new_w2_punct:
                        # Punctuation now travels with the repaired word;
                        # drop the copy at the start of the old separator.
                        token_list[i + 1][1] = (
                            new_w2_punct + token_list[i + 1][1].lstrip(".,;:!?")
                        )

        # --- Pass 2: Context split (anew → a new) ---
        expanded: List[List[str]] = []
        for i, (word, sep) in enumerate(token_list):
            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""
            split = self._try_context_split(word, next_word, prev_word)
            if split and split != word:
                changes.append(f"{word}→{split}")
                expanded.append([split, sep])
            else:
                expanded.append([word, sep])
        token_list = expanded

        # --- Pass 3: Per-word correction ---
        parts: List[str] = []

        # Keep whatever precedes the first token (e.g. "(" or "3 "); the
        # tokenizer skips it, so it would otherwise vanish from the output.
        if tokens:
            parts.append(text[:tokens[0].start()])

        for i, (word, sep) in enumerate(token_list):
            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""

            correction = self.correct_word(
                word, lang=effective_lang,
                prev_word=prev_word, next_word=next_word,
            )
            if correction and correction != word:
                changes.append(f"{word}→{correction}")
                parts.append(correction)
            else:
                parts.append(word)
            parts.append(sep)

        # Append any trailing text (the whole input when nothing tokenized)
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])

        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---

    def correct_vocab_entry(self, english: str, german: str,
                            example: str = "") -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language — the most reliable signal.
        The example sentence, when given, is language-detected instead.

        Returns:
            Dict with keys "english" and "german", plus "example" when a
            non-empty example was passed.
        """
        results = {}
        results["english"] = self.correct_text(english, lang="en")
        results["german"] = self.correct_text(german, lang="de")
        if example:
            # For examples, auto-detect language
            results["example"] = self.correct_text(example, lang="auto")
        return results