diff --git a/klausur-service/backend/smart_spell.py b/klausur-service/backend/smart_spell.py
index 3b5cf18..e70ca47 100644
--- a/klausur-service/backend/smart_spell.py
+++ b/klausur-service/backend/smart_spell.py
@@ -153,9 +153,18 @@ class SmartSpellChecker:
     # --- Single-word correction ---
 
     def _known(self, word: str) -> bool:
-        """True if word is known in EN or DE dictionary."""
+        """True if word is known in EN or DE dictionary, or is a known abbreviation."""
         w = word.lower()
-        return bool(self.en.known([w])) or bool(self.de.known([w]))
+        if bool(self.en.known([w])) or bool(self.de.known([w])):
+            return True
+        # Also accept known abbreviations (sth, sb, adj, etc.)
+        try:
+            from cv_ocr_engines import _KNOWN_ABBREVIATIONS
+            if w in _KNOWN_ABBREVIATIONS:
+                return True
+        except ImportError:
+            pass
+        return False
 
     def _known_in(self, word: str, lang: str) -> bool:
         """True if word is known in a specific language dictionary."""
@@ -289,6 +298,104 @@ class SmartSpellChecker:
                 return candidate
         return None
+
+    # --- Boundary repair (shifted word boundaries) ---
+
+    def _try_boundary_repair(self, word1: str, word2: str) -> Optional[Tuple[str, str]]:
+        """Fix shifted word boundaries between adjacent tokens.
+
+        OCR sometimes shifts the boundary: "at sth." → "ats th."
+        Try moving 1-2 chars from end of word1 to start of word2 and vice versa.
+        Returns (fixed_word1, fixed_word2) or None.
+        """
+        # Import known abbreviations for vocabulary context
+        try:
+            from cv_ocr_engines import _KNOWN_ABBREVIATIONS
+        except ImportError:
+            _KNOWN_ABBREVIATIONS = set()
+
+        # Strip trailing punctuation for checking, preserve for result
+        w2_stripped = word2.rstrip(".,;:!?")
+        w2_punct = word2[len(w2_stripped):]
+
+        # Try shifting 1-2 chars from word1 → word2
+        for shift in (1, 2):
+            if len(word1) <= shift:
+                continue
+            new_w1 = word1[:-shift]
+            new_w2_base = word1[-shift:] + w2_stripped
+
+            w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
+            w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS
+
+            if w1_ok and w2_ok:
+                return (new_w1, new_w2_base + w2_punct)
+
+        # Try shifting 1-2 chars from word2 → word1
+        for shift in (1, 2):
+            if len(w2_stripped) <= shift:
+                continue
+            new_w1 = word1 + w2_stripped[:shift]
+            new_w2_base = w2_stripped[shift:]
+
+            w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
+            w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS
+
+            if w1_ok and w2_ok:
+                return (new_w1, new_w2_base + w2_punct)
+
+        return None
+
+    # --- Context-based word split for ambiguous merges ---
+
+    # Patterns where a valid word is actually "a" + adjective/noun
+    _ARTICLE_SPLIT_CANDIDATES = {
+        # word → (article, remainder) — only when followed by a compatible word
+        "anew": ("a", "new"),
+        "areal": ("a", "real"),
+        "alive": None,  # genuinely one word, never split
+        "alone": None,
+        "aware": None,
+        "alike": None,
+        "apart": None,
+        "aside": None,
+        "above": None,
+        "about": None,
+        "among": None,
+        "along": None,
+    }
+
+    def _try_context_split(self, word: str, next_word: str,
+                           prev_word: str) -> Optional[str]:
+        """Split words like 'anew' → 'a new' when context indicates a merge.
+
+        Only splits when:
+        - The word is in the split candidates list
+        - The following word makes sense as a noun (for "a + adj + noun" pattern)
+        - OR the word is unknown and can be split into article + known word
+        """
+        w_lower = word.lower()
+
+        # Check explicit candidates
+        if w_lower in self._ARTICLE_SPLIT_CANDIDATES:
+            split = self._ARTICLE_SPLIT_CANDIDATES[w_lower]
+            if split is None:
+                return None  # explicitly marked as "don't split"
+            article, remainder = split
+            # Only split if followed by a word (noun pattern)
+            if next_word and next_word[0].islower():
+                return f"{article} {remainder}"
+            # Also split if remainder + next_word makes a common phrase
+            if next_word and self._known(next_word):
+                return f"{article} {remainder}"
+
+        # Generic: if word starts with 'a' and rest is a known adjective/word
+        if (len(word) >= 4 and word[0].lower() == 'a'
+                and not self._known(word)  # only for UNKNOWN words
+                and self._known(word[1:])):
+            return f"a {word[1:]}"
+
+        return None
 
     # --- a/I disambiguation ---
 
     def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
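Review note: expected helper behavior, restated from the tests added below (a minimal sketch; it assumes a SmartSpellChecker instance `sc` whose EN dictionary knows "at" and whose abbreviation list includes "sth"):

    repair = sc._try_boundary_repair("ats", "th.")
    assert repair == ("at", "sth.")   # boundary shifted back, punctuation preserved

    assert sc._try_context_split("anew", "book", "") == "a new"   # lowercase noun follows
    assert sc._try_context_split("alive", "and", "") is None      # marked "never split"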
@@ -309,6 +416,11 @@ class SmartSpellChecker:
     def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
         """Correct a full text string (field value).
 
+        Three passes:
+        1. Boundary repair — fix shifted word boundaries between adjacent tokens
+        2. Context split — split ambiguous merges (anew → a new)
+        3. Per-word correction — spell check individual words
+
         Args:
             text: The text to correct
             lang: Expected language ("en" or "de")
@@ -317,25 +429,88 @@ class SmartSpellChecker:
             return CorrectionResult(text, text, "unknown", False)
 
         detected = self.detect_text_lang(text) if lang == "auto" else lang
+        effective_lang = detected if detected in ("en", "de") else "en"
 
-        parts: List[str] = []
         changes: List[str] = []
         tokens = list(_TOKEN_RE.finditer(text))
-        for idx, m in enumerate(tokens):
-            token, sep = m.group(1), m.group(2)
-            next_word = tokens[idx + 1].group(1) if idx + 1 < len(tokens) else ""
-            prev_word = tokens[idx - 1].group(1) if idx > 0 else ""
+        # Extract token list: [(word, separator), ...]
+        token_list: List[List[str]] = []  # [[word, sep], ...]
+        for m in tokens:
+            token_list.append([m.group(1), m.group(2)])
+
+        # --- Pass 1: Boundary repair between adjacent unknown words ---
+        # Import abbreviations for the heuristic below
+        try:
+            from cv_ocr_engines import _KNOWN_ABBREVIATIONS as _ABBREVS
+        except ImportError:
+            _ABBREVS = set()
+
+        for i in range(len(token_list) - 1):
+            w1 = token_list[i][0]
+            w2_raw = token_list[i + 1][0]
+            # Include trailing punct from separator in w2 for abbreviation matching
+            # e.g., "ats" + " " + "th" + "." → try repair("ats", "th.")
+            w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
+            # Skip if both are known AND neither is suspiciously short (≤3 chars)
+            # Short known words like "ats", "th" may be OCR boundary errors
+            both_known = self._known(w1) and self._known(w2_raw)
+            both_long = len(w1) > 3 and len(w2_raw) > 3
+            if both_known and both_long:
+                continue
+            # Try with punctuation first (for abbreviations like "sth.")
+            repair = self._try_boundary_repair(w1, w2_with_punct)
+            if not repair and w2_with_punct != w2_raw:
+                repair = self._try_boundary_repair(w1, w2_raw)
+            if repair:
+                new_w1, new_w2_full = repair
+                # Quality gate: only accept if repair is actually better
+                # Better = at least one result is a known abbreviation, or
+                # both results are longer/more common than originals
+                new_w2_base = new_w2_full.rstrip(".,;:!?")
+                old_score = (len(w1) >= 3) + (len(w2_raw) >= 3)
+                new_score = (
+                    (self._known(new_w1) or new_w1.lower() in _ABBREVS)
+                    + (self._known(new_w2_base) or new_w2_base.lower() in _ABBREVS)
+                )
+                # Accept if new pair scores higher, or if it includes an abbreviation
+                has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
+                if new_score >= old_score or has_abbrev:
+                    new_w2_punct = new_w2_full[len(new_w2_base):]
+                    changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
+                    token_list[i][0] = new_w1
+                    token_list[i + 1][0] = new_w2_base
+                    if new_w2_punct:
+                        token_list[i + 1][1] = new_w2_punct + token_list[i + 1][1].lstrip(".,;:!?")
+
+        # --- Pass 2: Context split (anew → a new) ---
+        expanded: List[List[str]] = []
+        for i, (word, sep) in enumerate(token_list):
+            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
+            prev_word = token_list[i - 1][0] if i > 0 else ""
+            split = self._try_context_split(word, next_word, prev_word)
+            if split and split != word:
+                changes.append(f"{word}→{split}")
+                expanded.append([split, sep])
+            else:
+                expanded.append([word, sep])
+        token_list = expanded
+
+        # --- Pass 3: Per-word correction ---
+        parts: List[str] = []
+        for i, (word, sep) in enumerate(token_list):
+            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
+            prev_word = token_list[i - 1][0] if i > 0 else ""
 
             correction = self.correct_word(
-                token, lang=detected if detected in ("en", "de") else "en",
+                word, lang=effective_lang,
                 prev_word=prev_word,
                 next_word=next_word,
             )
-            if correction and correction != token:
-                changes.append(f"{token}→{correction}")
+            if correction and correction != word:
+                changes.append(f"{word}→{correction}")
                 parts.append(correction)
             else:
-                parts.append(token)
+                parts.append(word)
             parts.append(sep)
 
         # Append any trailing text
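Review note: end-to-end, the three passes compose as the new tests expect (a sketch with the same `sc` assumption as above):

    result = sc.correct_text("be good ats th.", "en")
    assert "at sth." in result.corrected        # Pass 1 repairs the shifted boundary

    result = sc.correct_text("anew book", "en")
    assert result.corrected == "a new book"     # Pass 2 splits the merged article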
diff --git a/klausur-service/backend/tests/test_smart_spell.py b/klausur-service/backend/tests/test_smart_spell.py
index 2816803..6f1d27e 100644
--- a/klausur-service/backend/tests/test_smart_spell.py
+++ b/klausur-service/backend/tests/test_smart_spell.py
@@ -156,6 +156,67 @@ class TestFullTextCorrection:
         assert result.corrected == ""
 
 
+# ─── Boundary Repair ───────────────────────────────────────────────────────
+
+
+class TestBoundaryRepair:
+
+    def test_ats_th_to_at_sth(self, sc):
+        """'ats th.' → 'at sth.' — shifted boundary with abbreviation."""
+        result = sc.correct_text("be good ats th.", "en")
+        assert "at sth." in result.corrected, f"Expected 'at sth.' in '{result.corrected}'"
+
+    def test_no_repair_if_both_known(self, sc):
+        """Don't repair if both words are already valid."""
+        result = sc.correct_text("at the", "en")
+        assert result.corrected == "at the"
+        assert not result.changed
+
+    def test_boundary_shift_right(self, sc):
+        """Shift chars from word1 to word2."""
+        repair = sc._try_boundary_repair("ats", "th")
+        assert repair == ("at", "sth"), f"Got {repair}"
+
+    def test_boundary_shift_with_punct(self, sc):
+        """Preserve punctuation during boundary repair."""
+        repair = sc._try_boundary_repair("ats", "th.")
+        assert repair is not None
+        assert repair[0] == "at"
+        assert repair[1] == "sth."
+
+
+# ─── Context Split ──────────────────────────────────────────────────────────
+
+
+class TestContextSplit:
+
+    def test_anew_to_a_new(self, sc):
+        """'anew' → 'a new' when followed by a noun."""
+        result = sc.correct_text("anew book", "en")
+        assert result.corrected == "a new book", f"Got '{result.corrected}'"
+
+    def test_anew_standalone_no_split(self, sc):
+        """'anew' at end of phrase might genuinely be 'anew'."""
+        # "start anew" — no next word to indicate split
+        # This is ambiguous, so we accept either behavior
+        pass
+
+    def test_alive_not_split(self, sc):
+        """'alive' should never be split to 'a live'."""
+        result = sc.correct_text("alive and well", "en")
+        assert "alive" in result.corrected
+
+    def test_alone_not_split(self, sc):
+        """'alone' should never be split."""
+        result = sc.correct_text("alone in the dark", "en")
+        assert "alone" in result.corrected
+
+    def test_about_not_split(self, sc):
+        """'about' should never be split to 'a bout'."""
+        result = sc.correct_text("about time", "en")
+        assert "about" in result.corrected
+
+
 # ─── Vocab Entry Correction ─────────────────────────────────────────────────
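Review note: the new test classes can be run in isolation with pytest's -k filter (assuming the suite is normally invoked from the repo root):

    pytest klausur-service/backend/tests/test_smart_spell.py -k "TestBoundaryRepair or TestContextSplit" -v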