"""SmartSpellChecker Text — full text correction, boundary repair, context split.

Extracted from smart_spell.py for modularity.

License: Apache 2.0 (commercial use permitted)
"""

import re
from typing import Dict, List, Optional, Tuple

from smart_spell_core import (
    _SmartSpellCoreBase,
    _TOKEN_RE,
    CorrectionResult,
    Lang,
)


class SmartSpellChecker(_SmartSpellCoreBase):
    """Language-aware OCR spell checker using pyspellchecker (no LLM).

    Inherits single-word correction from _SmartSpellCoreBase.
    Adds text-level passes: boundary repair, context split, full correction.
    """

    # Trailing punctuation that is detached from a word before dictionary
    # checks and re-attached afterwards.
    _BOUNDARY_PUNCT = ".,;:!?"

    # A word whose frequency is at or below this value counts as
    # "rare/unknown" for the abbreviation heuristic in boundary repair.
    _RARE_FREQ_THRESHOLD = 1e-6

    # A repaired pair must be more than this many times as frequent as the
    # original pair to be accepted (guards against false positives).
    _REPAIR_MIN_GAIN = 5

    # Frequency floor (as a multiple of the original pair's frequency) granted
    # to a repair that yields a known abbreviation.
    _ABBREV_GAIN = 10

    # --- Shared helpers ---

    @staticmethod
    def _known_abbreviations():
        """Return the abbreviation vocabulary from ``cv_ocr_engines``.

        The import is optional: when ``cv_ocr_engines`` cannot be imported an
        empty set is returned, so membership tests simply never match.
        """
        try:
            from cv_ocr_engines import _KNOWN_ABBREVIATIONS
        except ImportError:
            return set()
        return _KNOWN_ABBREVIATIONS

    # --- Boundary repair (shifted word boundaries) ---

    def _try_boundary_repair(self, word1: str, word2: str) -> Optional[Tuple[str, str]]:
        """Fix shifted word boundaries between adjacent tokens.

        OCR sometimes shifts the boundary: "at sth." -> "ats th."
        Candidates are tried in this order: move 1 then 2 chars from the end
        of ``word1`` to the start of ``word2``, then move 1 then 2 chars from
        the start of ``word2`` to the end of ``word1``. The first candidate in
        which both halves are known words or known abbreviations wins.

        Args:
            word1: Left token.
            word2: Right token; trailing ``.,;:!?`` is ignored for the lookup
                and re-attached to the repaired right word.

        Returns:
            ``(fixed_word1, fixed_word2)`` or ``None`` when no candidate fits.
        """
        abbreviations = self._known_abbreviations()

        def acceptable(candidate: str) -> bool:
            return self._known(candidate) or candidate.lower() in abbreviations

        # Strip trailing punctuation for checking, preserve it for the result.
        w2_stripped = word2.rstrip(self._BOUNDARY_PUNCT)
        w2_punct = word2[len(w2_stripped):]

        candidates: List[Tuple[str, str]] = []
        # Shift 1-2 chars from word1 -> word2 (word1 must keep >= 1 char).
        for shift in (1, 2):
            if len(word1) > shift:
                candidates.append((word1[:-shift], word1[-shift:] + w2_stripped))
        # Shift 1-2 chars from word2 -> word1 (word2 must keep >= 1 char).
        for shift in (1, 2):
            if len(w2_stripped) > shift:
                candidates.append((word1 + w2_stripped[:shift], w2_stripped[shift:]))

        for left, right in candidates:
            if acceptable(left) and acceptable(right):
                return (left, right + w2_punct)
        return None

    # --- Context-based word split for ambiguous merges ---

    # Patterns where a valid word is actually "a" + adjective/noun.
    # word -> (article, remainder), applied only when followed by a compatible
    # word; ``None`` marks a genuine single word that must never be split.
    _ARTICLE_SPLIT_CANDIDATES: Dict[str, Optional[Tuple[str, str]]] = {
        "anew": ("a", "new"),
        "areal": ("a", "real"),
        "alive": None,
        "alone": None,
        "aware": None,
        "alike": None,
        "apart": None,
        "aside": None,
        "above": None,
        "about": None,
        "among": None,
        "along": None,
    }

    def _try_context_split(self, word: str, next_word: str, prev_word: str) -> Optional[str]:
        """Split words like 'anew' -> 'a new' when context indicates a merge.

        A split is produced when either:
        - the word is an explicit split candidate and the following word
          starts with a lowercase letter or is a known word, or
        - the word is unknown, has at least 4 characters, starts with 'a'/'A'
          and the rest of it is a known word.

        Words explicitly listed with ``None`` are never split.

        Args:
            word: Token to examine.
            next_word: Following token, or "" at the end of the text.
            prev_word: Preceding token; currently not consulted.

        Returns:
            The split text ("a <remainder>") or ``None`` when no split applies.
        """
        entry_key = word.lower()

        # Explicit candidates.
        if entry_key in self._ARTICLE_SPLIT_CANDIDATES:
            entry = self._ARTICLE_SPLIT_CANDIDATES[entry_key]
            if entry is None:
                return None  # explicitly marked as "don't split"
            article, remainder = entry
            # Split only when a plausible following word exists.
            if next_word and (next_word[0].islower() or self._known(next_word)):
                return f"{article} {remainder}"

        # Generic: unknown word that is 'a' glued onto a known word.
        if (
            len(word) >= 4
            and word[0].lower() == 'a'
            and not self._known(word)
            and self._known(word[1:])
        ):
            return f"a {word[1:]}"

        return None

    # --- Full text correction ---

    def _repair_boundaries(self, token_list: List[List[str]], changes: List[str]) -> None:
        """Pass 1: repair shifted boundaries between adjacent tokens, in place.

        Args:
            token_list: ``[[word, separator], ...]``; accepted repairs overwrite
                the affected entries.
            changes: Receives one "old pair→new pair" entry per accepted repair.
        """
        abbreviations = self._known_abbreviations()
        punct = self._BOUNDARY_PUNCT

        for i in range(len(token_list) - 1):
            w1, sep_after_w1 = token_list[i]
            w2_raw, sep_after_w2 = token_list[i + 1]
            sep_before_w1 = token_list[i - 1][1] if i > 0 else ""

            # Skip IPA/bracket content. Brackets may sit in the tokens
            # themselves or in the adjacent separators.
            touches_bracket = (
                any(ch in w1 or ch in w2_raw or ch in sep_after_w1 for ch in "[]")
                or ']' in sep_after_w2   # w2 text was inside [brackets]
                or '[' in sep_before_w1  # w1 starts a bracket
            )
            if touches_bracket:
                continue

            # Include trailing punct from the separator so abbreviations such
            # as "sth." can match; fall back to the bare word otherwise.
            w2_with_punct = w2_raw + sep_after_w2.rstrip(" ")
            repair = self._try_boundary_repair(w1, w2_with_punct)
            if not repair and w2_with_punct != w2_raw:
                repair = self._try_boundary_repair(w1, w2_raw)
            if not repair:
                continue

            new_w1, new_w2_full = repair
            new_w2_base = new_w2_full.rstrip(punct)

            # Frequency-based scoring: product of word frequencies.
            # Higher product = more common word pair = better.
            freq_w1 = self._word_freq(w1)
            freq_w2 = self._word_freq(w2_raw)
            old_freq = freq_w1 * freq_w2
            new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)

            yields_abbrev = (
                new_w1.lower() in abbreviations
                or new_w2_base.lower() in abbreviations
            )
            if yields_abbrev:
                # Trust an abbreviation repair ONLY if at least one original
                # word is rare/unknown (prevents "Can I" -> "Ca nI" where both
                # original words are common and correct).
                both_common = (
                    freq_w1 > self._RARE_FREQ_THRESHOLD
                    and freq_w2 > self._RARE_FREQ_THRESHOLD
                )
                if not both_common:
                    new_freq = max(new_freq, old_freq * self._ABBREV_GAIN)

            # Accept only a clearly more frequent pair.
            if new_freq > old_freq * self._REPAIR_MIN_GAIN:
                new_w2_punct = new_w2_full[len(new_w2_base):]
                changes.append(f"{w1} {w2_raw}\u2192{new_w1} {new_w2_base}")
                token_list[i][0] = new_w1
                token_list[i + 1][0] = new_w2_base
                if new_w2_punct:
                    # Punctuation now belongs to the separator; avoid
                    # duplicating what the separator already started with.
                    token_list[i + 1][1] = new_w2_punct + sep_after_w2.lstrip(punct)

    def _split_merged_articles(
        self, token_list: List[List[str]], changes: List[str]
    ) -> Tuple[List[List[str]], List[bool]]:
        """Pass 2: split merged "a" + word tokens (anew -> a new).

        A split is emitted as two separate tokens joined by a single space, so
        that the per-word pass never receives a string containing a space and
        neighbouring tokens see real words as their context.

        Args:
            token_list: ``[[word, separator], ...]`` (not modified).
            changes: Receives one "word→split" entry per split.

        Returns:
            ``(expanded_tokens, skip_flags)`` of equal length. ``skip_flags[k]``
            is True when token ``k`` must not be spell-corrected because it
            sits inside IPA brackets; both halves of a split inherit the flag
            of the token they came from.
        """
        expanded: List[List[str]] = []
        skip_flags: List[bool] = []

        for i, (word, sep) in enumerate(token_list):
            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""
            prev_sep = token_list[i - 1][1] if i > 0 else ""
            # Brackets land in separators, not in the tokens themselves.
            in_brackets = '[' in prev_sep or ']' in sep

            split = self._try_context_split(word, next_word, prev_word)
            if split and split != word:
                changes.append(f"{word}\u2192{split}")
                head, space, tail = split.partition(" ")
                if space and head and tail:
                    expanded.append([head, " "])
                    expanded.append([tail, sep])
                    skip_flags.extend((in_brackets, in_brackets))
                    continue
                # No usable space in the split text: keep it as one token.
                word = split

            expanded.append([word, sep])
            skip_flags.append(in_brackets)

        return expanded, skip_flags

    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Three passes:
        1. Boundary repair -- fix shifted word boundaries between adjacent tokens
        2. Context split -- split ambiguous merges (anew -> a new)
        3. Per-word correction -- spell check individual words

        Args:
            text: Text to correct. Empty or whitespace-only text is returned
                unchanged with language "unknown".
            lang: Language code, or "auto" to detect it from ``text``. Any
                value other than "en"/"de" falls back to "en" for correction.

        Returns:
            CorrectionResult carrying the original and corrected text, the
            language (detected, or as passed in), a changed flag and the list
            of individual changes.
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)

        detected = self.detect_text_lang(text) if lang == "auto" else lang
        effective_lang = detected if detected in ("en", "de") else "en"
        changes: List[str] = []

        tokens = list(_TOKEN_RE.finditer(text))
        # Mutable [[word, separator], ...] so passes can rewrite entries.
        token_list: List[List[str]] = [[m.group(1), m.group(2)] for m in tokens]

        # --- Pass 1: Boundary repair between adjacent words ---
        self._repair_boundaries(token_list, changes)

        # --- Pass 2: Context split (anew -> a new) ---
        token_list, skip_flags = self._split_merged_articles(token_list, changes)

        # --- Pass 3: Per-word correction ---
        parts: List[str] = []

        # Preserve any leading text before the first token match.
        first_start = tokens[0].start() if tokens else 0
        if first_start > 0:
            parts.append(text[:first_start])

        for i, (word, sep) in enumerate(token_list):
            if skip_flags[i]:
                # Inside IPA brackets: copy through untouched.
                parts.append(word)
                parts.append(sep)
                continue

            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""

            correction = self.correct_word(
                word,
                lang=effective_lang,
                prev_word=prev_word,
                next_word=next_word,
            )
            if correction and correction != word:
                changes.append(f"{word}\u2192{correction}")
                parts.append(correction)
            else:
                parts.append(word)
            parts.append(sep)

        # Append any trailing text after the last token match.
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])

        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---

    def correct_vocab_entry(
        self, english: str, german: str, example: str = ""
    ) -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language -- the most reliable
        signal. Only the example sentence is language-detected.

        Args:
            english: Text of the English column (corrected as "en").
            german: Text of the German column (corrected as "de").
            example: Optional example sentence (corrected with "auto").

        Returns:
            Dict with keys "english" and "german", plus "example" when an
            example was given.
        """
        results: Dict[str, CorrectionResult] = {
            "english": self.correct_text(english, lang="en"),
            "german": self.correct_text(german, lang="de"),
        }
        if example:
            # For examples, auto-detect language.
            results["example"] = self.correct_text(example, lang="auto")
        return results