SmartSpellChecker: frequency-based boundary repair for valid word pairs
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m42s
CI / test-python-agent-core (push) Successful in 37s
CI / test-nodejs-website (push) Successful in 35s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 40s
CI / test-python-klausur (push) Failing after 2m42s
CI / test-python-agent-core (push) Successful in 37s
CI / test-nodejs-website (push) Successful in 35s
Previously, boundary repair was skipped when both words were valid dictionary words (e.g., "Pound sand", "wit hit", "done euro"). Now uses word-frequency scoring (product of bigram frequencies) to decide if the repair produces a more common word pair. Threshold: repair accepted when new pair is >5x more frequent, or when repair produces a known abbreviation. New fixes: Pound sand→Pounds and (2000x), wit hit→with it (100000x), done euro→one euro (7x). 43 tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -166,6 +166,11 @@ class SmartSpellChecker:
|
|||||||
pass
|
pass
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _word_freq(self, word: str) -> float:
|
||||||
|
"""Get word frequency (max of EN and DE)."""
|
||||||
|
w = word.lower()
|
||||||
|
return max(self.en.word_usage_frequency(w), self.de.word_usage_frequency(w))
|
||||||
|
|
||||||
def _known_in(self, word: str, lang: str) -> bool:
|
def _known_in(self, word: str, lang: str) -> bool:
|
||||||
"""True if word is known in a specific language dictionary."""
|
"""True if word is known in a specific language dictionary."""
|
||||||
w = word.lower()
|
w = word.lower()
|
||||||
@@ -450,32 +455,31 @@ class SmartSpellChecker:
|
|||||||
w1 = token_list[i][0]
|
w1 = token_list[i][0]
|
||||||
w2_raw = token_list[i + 1][0]
|
w2_raw = token_list[i + 1][0]
|
||||||
# Include trailing punct from separator in w2 for abbreviation matching
|
# Include trailing punct from separator in w2 for abbreviation matching
|
||||||
# e.g., "ats" + " " + "th" + "." → try repair("ats", "th.")
|
|
||||||
w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
|
w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
|
||||||
# Skip if both are known AND neither is suspiciously short (≤3 chars)
|
|
||||||
# Short known words like "ats", "th" may be OCR boundary errors
|
# Try boundary repair — always, even if both words are valid.
|
||||||
both_known = self._known(w1) and self._known(w2_raw)
|
# Use word-frequency scoring to decide if repair is better.
|
||||||
both_long = len(w1) > 3 and len(w2_raw) > 3
|
|
||||||
if both_known and both_long:
|
|
||||||
continue
|
|
||||||
# Try with punctuation first (for abbreviations like "sth.")
|
|
||||||
repair = self._try_boundary_repair(w1, w2_with_punct)
|
repair = self._try_boundary_repair(w1, w2_with_punct)
|
||||||
if not repair and w2_with_punct != w2_raw:
|
if not repair and w2_with_punct != w2_raw:
|
||||||
repair = self._try_boundary_repair(w1, w2_raw)
|
repair = self._try_boundary_repair(w1, w2_raw)
|
||||||
if repair:
|
if repair:
|
||||||
new_w1, new_w2_full = repair
|
new_w1, new_w2_full = repair
|
||||||
# Quality gate: only accept if repair is actually better
|
|
||||||
# Better = at least one result is a known abbreviation, or
|
|
||||||
# both results are longer/more common than originals
|
|
||||||
new_w2_base = new_w2_full.rstrip(".,;:!?")
|
new_w2_base = new_w2_full.rstrip(".,;:!?")
|
||||||
old_score = (len(w1) >= 3) + (len(w2_raw) >= 3)
|
|
||||||
new_score = (
|
# Frequency-based scoring: product of word frequencies
|
||||||
(self._known(new_w1) or new_w1.lower() in _ABBREVS)
|
# Higher product = more common word pair = better
|
||||||
+ (self._known(new_w2_base) or new_w2_base.lower() in _ABBREVS)
|
old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
|
||||||
)
|
new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
|
||||||
# Accept if new pair scores higher, or if it includes an abbreviation
|
|
||||||
|
# Abbreviation bonus: if repair produces a known abbreviation,
|
||||||
|
# add a large frequency boost (abbreviations have zero frequency)
|
||||||
has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
|
has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
|
||||||
if new_score >= old_score or has_abbrev:
|
if has_abbrev:
|
||||||
|
new_freq = max(new_freq, old_freq * 10)
|
||||||
|
|
||||||
|
# Accept if repair produces a more frequent word pair
|
||||||
|
# (threshold: at least 5x more frequent to avoid false positives)
|
||||||
|
if new_freq > old_freq * 5 or has_abbrev:
|
||||||
new_w2_punct = new_w2_full[len(new_w2_base):]
|
new_w2_punct = new_w2_full[len(new_w2_base):]
|
||||||
changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
|
changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
|
||||||
token_list[i][0] = new_w1
|
token_list[i][0] = new_w1
|
||||||
|
|||||||
@@ -166,8 +166,8 @@ class TestBoundaryRepair:
|
|||||||
result = sc.correct_text("be good ats th.", "en")
|
result = sc.correct_text("be good ats th.", "en")
|
||||||
assert "at sth." in result.corrected, f"Expected 'at sth.' in '{result.corrected}'"
|
assert "at sth." in result.corrected, f"Expected 'at sth.' in '{result.corrected}'"
|
||||||
|
|
||||||
def test_no_repair_if_both_known(self, sc):
|
def test_no_repair_common_pair(self, sc):
|
||||||
"""Don't repair if both words are already valid."""
|
"""Don't repair if both words form a common pair."""
|
||||||
result = sc.correct_text("at the", "en")
|
result = sc.correct_text("at the", "en")
|
||||||
assert result.corrected == "at the"
|
assert result.corrected == "at the"
|
||||||
assert not result.changed
|
assert not result.changed
|
||||||
@@ -184,6 +184,21 @@ class TestBoundaryRepair:
|
|||||||
assert repair[0] == "at"
|
assert repair[0] == "at"
|
||||||
assert repair[1] == "sth."
|
assert repair[1] == "sth."
|
||||||
|
|
||||||
|
def test_pound_sand_to_pounds_and(self, sc):
|
||||||
|
"""'Pound sand' → 'Pounds and' — both valid but repair is much more frequent."""
|
||||||
|
result = sc.correct_text("Pound sand euros", "en")
|
||||||
|
assert "Pounds and" in result.corrected, f"Expected 'Pounds and' in '{result.corrected}'"
|
||||||
|
|
||||||
|
def test_wit_hit_to_with_it(self, sc):
|
||||||
|
"""'wit hit' → 'with it' — frequency-based repair."""
|
||||||
|
result = sc.correct_text("be careful wit hit", "en")
|
||||||
|
assert "with it" in result.corrected, f"Expected 'with it' in '{result.corrected}'"
|
||||||
|
|
||||||
|
def test_done_euro_to_one_euro(self, sc):
|
||||||
|
"""'done euro' → 'one euro' in context."""
|
||||||
|
result = sc.correct_text("done euro", "en")
|
||||||
|
assert "one euro" in result.corrected, f"Expected 'one euro' in '{result.corrected}'"
|
||||||
|
|
||||||
|
|
||||||
# ─── Context Split ──────────────────────────────────────────────────────────
|
# ─── Context Split ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user