SmartSpellChecker: frequency scoring, IPA protection, slash→l fix

Major improvements:

- Frequency-based boundary repair: always attempts the repair and uses the product of the word frequencies to decide ("Pound sand" → "Pounds and": 2000x better)
- IPA bracket protection: words inside [brackets] are never modified, even when the brackets land in tokenizer separators
- Slash→l substitution: "p/" → "pl" for an italic l misread as a slash
- Abbreviation guard uses a rare-word threshold (freq < 1e-6) instead of a binary known/unknown check — prevents "Can I" → "Ca nI" while still fixing "ats th." → "at sth."
- Tokenizer includes the / character for slash-word detection

43 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 07:36:39 +02:00
parent 31089df36f
commit 693803fb7c
1 changed files with 45 additions and 6 deletions
@@ -70,6 +70,7 @@ _DIGIT_SUBS: Dict[str, List[str]] = {
    '6': ['g', 'G'],
    '8': ['b', 'B'],
    '|': ['I', 'l'],
+    '/': ['l'],  # italic 'l' misread as slash (e.g. "p/" → "pl")
 }
 _SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())

@@ -79,8 +80,8 @@ _UMLAUT_MAP = {
    'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
 }

-# Tokenizer
-_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")
+# Tokenizer — includes | and / so OCR artifacts like "p/" are treated as words
+_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|/]+)([^A-Za-zÄÖÜäöüß'|/]*)")


 # ---------------------------------------------------------------------------
@@ -196,6 +197,10 @@ class SmartSpellChecker:
        if word.isdigit() or '.' in word:
            return None

+        # Skip IPA/phonetic content in brackets
+        if '[' in word or ']' in word:
+            return None
+
        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)

        # 1. Already known → no fix
@@ -454,6 +459,22 @@ class SmartSpellChecker:
        for i in range(len(token_list) - 1):
            w1 = token_list[i][0]
            w2_raw = token_list[i + 1][0]
+
+            # Skip boundary repair for IPA/bracket content
+            # Brackets may be in the token OR in the adjacent separators
+            sep_before_w1 = token_list[i - 1][1] if i > 0 else ""
+            sep_after_w1 = token_list[i][1]
+            sep_after_w2 = token_list[i + 1][1]
+            has_bracket = (
+                '[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw
+                or ']' in sep_after_w1  # w1 text was inside [brackets]
+                or '[' in sep_after_w1  # w2 starts a bracket
+                or ']' in sep_after_w2  # w2 text was inside [brackets]
+                or '[' in sep_before_w1  # w1 starts a bracket
+            )
+            if has_bracket:
+                continue
+
            # Include trailing punct from separator in w2 for abbreviation matching
            w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")

@@ -471,15 +492,26 @@ class SmartSpellChecker:
                old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
                new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)

-                # Abbreviation bonus: if repair produces a known abbreviation,
-                # add a large frequency boost (abbreviations have zero frequency)
+                # Abbreviation bonus: if repair produces a known abbreviation
                has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
                if has_abbrev:
-                    new_freq = max(new_freq, old_freq * 10)
+                    # Accept abbreviation repair ONLY if at least one of the
+                    # original words is rare/unknown (prevents "Can I" → "Ca nI"
+                    # where both original words are common and correct).
+                    # "Rare" = frequency < 1e-6 (covers "ats", "th" but not "Can", "I")
+                    RARE_THRESHOLD = 1e-6
+                    orig_both_common = (
+                        self._word_freq(w1) > RARE_THRESHOLD
+                        and self._word_freq(w2_raw) > RARE_THRESHOLD
+                    )
+                    if not orig_both_common:
+                        new_freq = max(new_freq, old_freq * 10)
+                    else:
+                        has_abbrev = False  # both originals common → don't trust

                # Accept if repair produces a more frequent word pair
                # (threshold: at least 5x more frequent to avoid false positives)
-                if new_freq > old_freq * 5 or has_abbrev:
+                if new_freq > old_freq * 5:
                    new_w2_punct = new_w2_full[len(new_w2_base):]
                    changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
                    token_list[i][0] = new_w1
@@ -503,6 +535,13 @@ class SmartSpellChecker:
        # --- Pass 3: Per-word correction ---
        parts: List[str] = []
        for i, (word, sep) in enumerate(token_list):
+            # Skip words inside IPA brackets (brackets land in separators)
+            prev_sep = token_list[i - 1][1] if i > 0 else ""
+            if '[' in prev_sep or ']' in sep:
+                parts.append(word)
+                parts.append(sep)
+                continue
+
            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""