SmartSpellChecker: frequency-based boundary repair for valid word pairs
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m42s
CI / test-python-agent-core (push) Successful in 37s
CI / test-nodejs-website (push) Successful in 35s
Previously, boundary repair was skipped when both words were valid dictionary words (e.g., "Pound sand", "wit hit", "done euro"). Boundary repair now uses word-frequency scoring (the product of the two words' frequencies) to decide whether a repair produces a more common word pair. Threshold: a repair is accepted when the new pair is >5x more frequent, or when the repair produces a known abbreviation. New fixes: Pound sand→Pounds and (2000x), wit hit→with it (100000x), done euro→one euro (7x). 43 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
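The acceptance rule described in the commit message (>5x frequency threshold, with an abbreviation override) can be sketched as follows. This is a minimal illustration, not the project's code: the `FREQ` table, `ABBREVS` set, and frequency values are made-up stand-ins for the real EN/DE dictionaries.

```python
# Illustrative frequency table; values are invented for the example.
FREQ = {"wit": 1e-6, "hit": 1e-4, "with": 7e-3, "it": 6e-3}
ABBREVS = {"sth", "etc"}  # known abbreviations (hypothetical subset)

def word_freq(word: str) -> float:
    """Frequency of a word, 0.0 if unknown."""
    return FREQ.get(word.lower(), 0.0)

def accept_repair(old_pair: tuple[str, str], new_pair: tuple[str, str]) -> bool:
    """Accept a boundary repair when the repaired pair is >5x more
    frequent than the original, or when it yields a known abbreviation."""
    old_freq = word_freq(old_pair[0]) * word_freq(old_pair[1])
    new_freq = word_freq(new_pair[0]) * word_freq(new_pair[1])
    has_abbrev = new_pair[0].lower() in ABBREVS or new_pair[1].lower() in ABBREVS
    if has_abbrev:
        # Abbreviation bonus: abbreviations score zero frequency otherwise.
        new_freq = max(new_freq, old_freq * 10)
    return new_freq > old_freq * 5 or has_abbrev

accept_repair(("wit", "hit"), ("with", "it"))  # True: ~4.2e-5 vs ~1e-10
```

With these toy numbers the rule accepts "wit hit"→"with it" (the new pair is orders of magnitude more frequent) while refusing the reverse repair.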
@@ -166,6 +166,11 @@ class SmartSpellChecker:
                 pass
         return False
 
+    def _word_freq(self, word: str) -> float:
+        """Get word frequency (max of EN and DE)."""
+        w = word.lower()
+        return max(self.en.word_usage_frequency(w), self.de.word_usage_frequency(w))
+
     def _known_in(self, word: str, lang: str) -> bool:
         """True if word is known in a specific language dictionary."""
         w = word.lower()
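The `_word_freq` helper added above takes the maximum of the English and German dictionary frequencies, so a word that is common in either language counts as common. A minimal dict-backed sketch of that idea; the `EN_FREQ`/`DE_FREQ` tables and their values are hypothetical, whereas the real code queries spellchecker objects:

```python
# Toy stand-ins for the English and German frequency dictionaries.
EN_FREQ = {"with": 6.7e-3, "it": 5.5e-3, "hit": 1.1e-4}
DE_FREQ = {"mit": 7.0e-3, "hit": 2.0e-5}

def word_freq(word: str) -> float:
    """Max of EN and DE frequency; 0.0 if the word is unknown in both."""
    w = word.lower()
    return max(EN_FREQ.get(w, 0.0), DE_FREQ.get(w, 0.0))
```

For example, "hit" appears in both tables, and the larger (English) value wins; a German-only word like "mit" still gets a nonzero score.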
@@ -450,32 +455,31 @@ class SmartSpellChecker:
             w1 = token_list[i][0]
             w2_raw = token_list[i + 1][0]
             # Include trailing punct from separator in w2 for abbreviation matching
             # e.g., "ats" + " " + "th" + "." → try repair("ats", "th.")
             w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
-            # Skip if both are known AND neither is suspiciously short (≤3 chars)
-            # Short known words like "ats", "th" may be OCR boundary errors
-            both_known = self._known(w1) and self._known(w2_raw)
-            both_long = len(w1) > 3 and len(w2_raw) > 3
-            if both_known and both_long:
-                continue
-            # Try with punctuation first (for abbreviations like "sth.")
-
+            # Try boundary repair — always, even if both words are valid.
+            # Use word-frequency scoring to decide if repair is better.
             repair = self._try_boundary_repair(w1, w2_with_punct)
             if not repair and w2_with_punct != w2_raw:
                 repair = self._try_boundary_repair(w1, w2_raw)
             if repair:
                 new_w1, new_w2_full = repair
-                # Quality gate: only accept if repair is actually better
-                # Better = at least one result is a known abbreviation, or
-                # both results are longer/more common than originals
                 new_w2_base = new_w2_full.rstrip(".,;:!?")
-                old_score = (len(w1) >= 3) + (len(w2_raw) >= 3)
-                new_score = (
-                    (self._known(new_w1) or new_w1.lower() in _ABBREVS)
-                    + (self._known(new_w2_base) or new_w2_base.lower() in _ABBREVS)
-                )
-                # Accept if new pair scores higher, or if it includes an abbreviation
+
+                # Frequency-based scoring: product of word frequencies
+                # Higher product = more common word pair = better
+                old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
+                new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
+
+                # Abbreviation bonus: if repair produces a known abbreviation,
+                # add a large frequency boost (abbreviations have zero frequency)
                 has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
-                if new_score >= old_score or has_abbrev:
+                if has_abbrev:
+                    new_freq = max(new_freq, old_freq * 10)
+
+                # Accept if repair produces a more frequent word pair
+                # (threshold: at least 5x more frequent to avoid false positives)
+                if new_freq > old_freq * 5 or has_abbrev:
+                    new_w2_punct = new_w2_full[len(new_w2_base):]
                     changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
                     token_list[i][0] = new_w1
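The repair path strips trailing sentence punctuation from the repaired second word (`rstrip(".,;:!?")`) and keeps whatever was stripped so it can be reattached, which is what lets abbreviation forms like "sth." match against the base "sth". A standalone sketch of that split step, with a hypothetical helper name:

```python
def split_trailing_punct(token: str) -> tuple[str, str]:
    """Split a token into (base, trailing punctuation).

    Mirrors the rstrip(".,;:!?") + slice pattern from the diff,
    e.g. "sth." -> ("sth", ".").
    """
    base = token.rstrip(".,;:!?")
    return base, token[len(base):]

base, punct = split_trailing_punct("sth.")
# base == "sth", punct == "."
```

The base is what gets checked against the dictionaries and `_ABBREVS`; the punctuation suffix is carried along unchanged.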