SmartSpellChecker: frequency scoring, IPA protection, slash→l fix

Major improvements:
- Frequency-based boundary repair: always attempts the repair and uses the product of the word frequencies to decide ("Pound sand" → "Pounds and": 2000x better)
- IPA bracket protection: words inside [brackets] are never modified, even when the brackets land in tokenizer separators
- Slash→l substitution: "p/" → "pl" for an italic l misread as a slash
- Abbreviation guard uses a rare-word threshold (freq < 1e-6) instead of binary known/unknown — prevents "Can I" → "Ca nI" while still fixing "ats th." → "at sth."
- Tokenizer includes the / character for slash-word detection

43 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 07:36:39 +02:00
parent 31089df36f
commit 693803fb7c
1 changed files with 45 additions and 6 deletions
@@ -70,6 +70,7 @@ _DIGIT_SUBS: Dict[str, List[str]] = {
    '6': ['g', 'G'],
    '8': ['b', 'B'],
    '|': ['I', 'l'],
    '/': ['l'],  # italic 'l' misread as slash (e.g. "p/" → "pl")
 }
 _SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
@@ -79,8 +80,8 @@ _UMLAUT_MAP = {
    'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
 }
-# Tokenizer
+# Tokenizer — includes | and / so OCR artifacts like "p/" are treated as words
-_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")
+_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|/]+)([^A-Za-zÄÖÜäöüß'|/]*)")
 # ---------------------------------------------------------------------------
@@ -196,6 +197,10 @@ class SmartSpellChecker:
        if word.isdigit() or '.' in word:
            return None
        # Skip IPA/phonetic content in brackets
        if '[' in word or ']' in word:
            return None
        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
        # 1. Already known → no fix
@@ -454,6 +459,22 @@ class SmartSpellChecker:
        for i in range(len(token_list) - 1):
            w1 = token_list[i][0]
            w2_raw = token_list[i + 1][0]
            # Skip boundary repair for IPA/bracket content
            # Brackets may be in the token OR in the adjacent separators
            sep_before_w1 = token_list[i - 1][1] if i > 0 else ""
            sep_after_w1 = token_list[i][1]
            sep_after_w2 = token_list[i + 1][1]
            has_bracket = (
                '[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw
                or ']' in sep_after_w1  # w1 text was inside [brackets]
                or '[' in sep_after_w1  # w2 starts a bracket
                or ']' in sep_after_w2  # w2 text was inside [brackets]
                or '[' in sep_before_w1  # w1 starts a bracket
            )
            if has_bracket:
                continue
            # Include trailing punct from separator in w2 for abbreviation matching
            w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
@@ -471,15 +492,26 @@ class SmartSpellChecker:
                old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
                new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
-                # Abbreviation bonus: if repair produces a known abbreviation,
+                # Abbreviation bonus: if repair produces a known abbreviation
                # add a large frequency boost (abbreviations have zero frequency)
                has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
                if has_abbrev:
                    # Accept abbreviation repair ONLY if at least one of the
                    # original words is rare/unknown (prevents "Can I" → "Ca nI"
                    # where both original words are common and correct).
                    # "Rare" = frequency < 1e-6 (covers "ats", "th" but not "Can", "I")
                    RARE_THRESHOLD = 1e-6
                    orig_both_common = (
                        self._word_freq(w1) > RARE_THRESHOLD
                        and self._word_freq(w2_raw) > RARE_THRESHOLD
                    )
                    if not orig_both_common:
                        new_freq = max(new_freq, old_freq * 10)
                    else:
                        has_abbrev = False  # both originals common → don't trust
                # Accept if repair produces a more frequent word pair
                # (threshold: at least 5x more frequent to avoid false positives)
-                if new_freq > old_freq * 5 or has_abbrev:
+                if new_freq > old_freq * 5:
                    new_w2_punct = new_w2_full[len(new_w2_base):]
                    changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
                    token_list[i][0] = new_w1
@@ -503,6 +535,13 @@ class SmartSpellChecker:
        # --- Pass 3: Per-word correction ---
        parts: List[str] = []
        for i, (word, sep) in enumerate(token_list):
            # Skip words inside IPA brackets (brackets land in separators)
            prev_sep = token_list[i - 1][1] if i > 0 else ""
            if '[' in prev_sep or ']' in sep:
                parts.append(word)
                parts.append(sep)
                continue
            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""