diff --git a/klausur-service/backend/smart_spell.py b/klausur-service/backend/smart_spell.py
index e70ca47..66f1f0b 100644
--- a/klausur-service/backend/smart_spell.py
+++ b/klausur-service/backend/smart_spell.py
@@ -166,6 +166,11 @@ class SmartSpellChecker:
             pass
         return False
 
+    def _word_freq(self, word: str) -> float:
+        """Get word frequency (max of EN and DE)."""
+        w = word.lower()
+        return max(self.en.word_usage_frequency(w), self.de.word_usage_frequency(w))
+
     def _known_in(self, word: str, lang: str) -> bool:
         """True if word is known in a specific language dictionary."""
         w = word.lower()
@@ -450,32 +455,27 @@ class SmartSpellChecker:
             w1 = token_list[i][0]
             w2_raw = token_list[i + 1][0]
             # Include trailing punct from separator in w2 for abbreviation matching
-            # e.g., "ats" + " " + "th" + "." → try repair("ats", "th.")
             w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
-            # Skip if both are known AND neither is suspiciously short (≤3 chars)
-            # Short known words like "ats", "th" may be OCR boundary errors
-            both_known = self._known(w1) and self._known(w2_raw)
-            both_long = len(w1) > 3 and len(w2_raw) > 3
-            if both_known and both_long:
-                continue
-            # Try with punctuation first (for abbreviations like "sth.")
+
+            # Try boundary repair — always, even if both words are valid.
+            # Use word-frequency scoring to decide if repair is better.
             repair = self._try_boundary_repair(w1, w2_with_punct)
             if not repair and w2_with_punct != w2_raw:
                 repair = self._try_boundary_repair(w1, w2_raw)
             if repair:
                 new_w1, new_w2_full = repair
-                # Quality gate: only accept if repair is actually better
-                # Better = at least one result is a known abbreviation, or
-                # both results are longer/more common than originals
                 new_w2_base = new_w2_full.rstrip(".,;:!?")
-                old_score = (len(w1) >= 3) + (len(w2_raw) >= 3)
-                new_score = (
-                    (self._known(new_w1) or new_w1.lower() in _ABBREVS)
-                    + (self._known(new_w2_base) or new_w2_base.lower() in _ABBREVS)
-                )
-                # Accept if new pair scores higher, or if it includes an abbreviation
+
+                # Frequency-based scoring: product of word frequencies.
+                # Higher product = more common word pair = better.
+                old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
+                new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
+
                 has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
-                if new_score >= old_score or has_abbrev:
+
+                # Accept if the repair yields a known abbreviation (these have
+                # ~zero dictionary frequency, so they are accepted outright), or
+                # a pair at least 5x more frequent (avoids false positives).
+                if new_freq > old_freq * 5 or has_abbrev:
                     new_w2_punct = new_w2_full[len(new_w2_base):]
                     changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
                     token_list[i][0] = new_w1
diff --git a/klausur-service/backend/tests/test_smart_spell.py b/klausur-service/backend/tests/test_smart_spell.py
index 6f1d27e..deae241 100644
--- a/klausur-service/backend/tests/test_smart_spell.py
+++ b/klausur-service/backend/tests/test_smart_spell.py
@@ -166,8 +166,8 @@ class TestBoundaryRepair:
         result = sc.correct_text("be good ats th.", "en")
         assert "at sth." in result.corrected, f"Expected 'at sth.' in '{result.corrected}'"
 
-    def test_no_repair_if_both_known(self, sc):
-        """Don't repair if both words are already valid."""
+    def test_no_repair_common_pair(self, sc):
+        """Don't repair if both words form a common pair."""
         result = sc.correct_text("at the", "en")
         assert result.corrected == "at the"
         assert not result.changed
@@ -184,6 +184,21 @@
         assert repair[0] == "at"
         assert repair[1] == "sth."
 
+    def test_pound_sand_to_pounds_and(self, sc):
+        """'Pound sand' → 'Pounds and' — both valid but repair is much more frequent."""
+        result = sc.correct_text("Pound sand euros", "en")
+        assert "Pounds and" in result.corrected, f"Expected 'Pounds and' in '{result.corrected}'"
+
+    def test_wit_hit_to_with_it(self, sc):
+        """'wit hit' → 'with it' — frequency-based repair."""
+        result = sc.correct_text("be careful wit hit", "en")
+        assert "with it" in result.corrected, f"Expected 'with it' in '{result.corrected}'"
+
+    def test_done_euro_to_one_euro(self, sc):
+        """'done euro' → 'one euro' in context."""
+        result = sc.correct_text("done euro", "en")
+        assert "one euro" in result.corrected, f"Expected 'one euro' in '{result.corrected}'"
+
 
 # ─── Context Split ──────────────────────────────────────────────────────────