SmartSpellChecker: frequency scoring, IPA protection, slash→l fix
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 37s
CI / test-nodejs-website (push) Successful in 31s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m55s
CI / test-python-agent-core (push) Successful in 37s
CI / test-nodejs-website (push) Successful in 31s
Major improvements:
- Frequency-based boundary repair: always tries the repair and uses the word-frequency product to decide ("Pound sand" → "Pounds and": 2000x better).
- IPA bracket protection: words inside [brackets] are never modified, even when the brackets land in the tokenizer's separators.
- Slash→l substitution: "p/" → "pl", for an italic "l" misread as a slash.
- Abbreviation guard: uses a rare-word threshold (freq < 1e-6) instead of a binary known/unknown check — prevents "Can I" → "Ca nI" while still fixing "ats th." → "at sth."
- Tokenizer: includes the / character for slash-word detection.

43 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,7 @@ _DIGIT_SUBS: Dict[str, List[str]] = {
|
|||||||
'6': ['g', 'G'],
|
'6': ['g', 'G'],
|
||||||
'8': ['b', 'B'],
|
'8': ['b', 'B'],
|
||||||
'|': ['I', 'l'],
|
'|': ['I', 'l'],
|
||||||
|
'/': ['l'], # italic 'l' misread as slash (e.g. "p/" → "pl")
|
||||||
}
|
}
|
||||||
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
|
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
|
||||||
|
|
||||||
@@ -79,8 +80,8 @@ _UMLAUT_MAP = {
|
|||||||
'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
|
'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü',
|
||||||
}
|
}
|
||||||
|
|
||||||
# Tokenizer
|
# Tokenizer — includes | and / so OCR artifacts like "p/" are treated as words
|
||||||
_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)")
|
_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|/]+)([^A-Za-zÄÖÜäöüß'|/]*)")
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -196,6 +197,10 @@ class SmartSpellChecker:
|
|||||||
if word.isdigit() or '.' in word:
|
if word.isdigit() or '.' in word:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Skip IPA/phonetic content in brackets
|
||||||
|
if '[' in word or ']' in word:
|
||||||
|
return None
|
||||||
|
|
||||||
has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
|
has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
|
||||||
|
|
||||||
# 1. Already known → no fix
|
# 1. Already known → no fix
|
||||||
@@ -454,6 +459,22 @@ class SmartSpellChecker:
|
|||||||
for i in range(len(token_list) - 1):
|
for i in range(len(token_list) - 1):
|
||||||
w1 = token_list[i][0]
|
w1 = token_list[i][0]
|
||||||
w2_raw = token_list[i + 1][0]
|
w2_raw = token_list[i + 1][0]
|
||||||
|
|
||||||
|
# Skip boundary repair for IPA/bracket content
|
||||||
|
# Brackets may be in the token OR in the adjacent separators
|
||||||
|
sep_before_w1 = token_list[i - 1][1] if i > 0 else ""
|
||||||
|
sep_after_w1 = token_list[i][1]
|
||||||
|
sep_after_w2 = token_list[i + 1][1]
|
||||||
|
has_bracket = (
|
||||||
|
'[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw
|
||||||
|
or ']' in sep_after_w1 # w1 text was inside [brackets]
|
||||||
|
or '[' in sep_after_w1 # w2 starts a bracket
|
||||||
|
or ']' in sep_after_w2 # w2 text was inside [brackets]
|
||||||
|
or '[' in sep_before_w1 # w1 starts a bracket
|
||||||
|
)
|
||||||
|
if has_bracket:
|
||||||
|
continue
|
||||||
|
|
||||||
# Include trailing punct from separator in w2 for abbreviation matching
|
# Include trailing punct from separator in w2 for abbreviation matching
|
||||||
w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
|
w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
|
||||||
|
|
||||||
@@ -471,15 +492,26 @@ class SmartSpellChecker:
|
|||||||
old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
|
old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
|
||||||
new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
|
new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
|
||||||
|
|
||||||
# Abbreviation bonus: if repair produces a known abbreviation,
|
# Abbreviation bonus: if repair produces a known abbreviation
|
||||||
# add a large frequency boost (abbreviations have zero frequency)
|
|
||||||
has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
|
has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
|
||||||
if has_abbrev:
|
if has_abbrev:
|
||||||
|
# Accept abbreviation repair ONLY if at least one of the
|
||||||
|
# original words is rare/unknown (prevents "Can I" → "Ca nI"
|
||||||
|
# where both original words are common and correct).
|
||||||
|
# "Rare" = frequency < 1e-6 (covers "ats", "th" but not "Can", "I")
|
||||||
|
RARE_THRESHOLD = 1e-6
|
||||||
|
orig_both_common = (
|
||||||
|
self._word_freq(w1) > RARE_THRESHOLD
|
||||||
|
and self._word_freq(w2_raw) > RARE_THRESHOLD
|
||||||
|
)
|
||||||
|
if not orig_both_common:
|
||||||
new_freq = max(new_freq, old_freq * 10)
|
new_freq = max(new_freq, old_freq * 10)
|
||||||
|
else:
|
||||||
|
has_abbrev = False # both originals common → don't trust
|
||||||
|
|
||||||
# Accept if repair produces a more frequent word pair
|
# Accept if repair produces a more frequent word pair
|
||||||
# (threshold: at least 5x more frequent to avoid false positives)
|
# (threshold: at least 5x more frequent to avoid false positives)
|
||||||
if new_freq > old_freq * 5 or has_abbrev:
|
if new_freq > old_freq * 5:
|
||||||
new_w2_punct = new_w2_full[len(new_w2_base):]
|
new_w2_punct = new_w2_full[len(new_w2_base):]
|
||||||
changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
|
changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}")
|
||||||
token_list[i][0] = new_w1
|
token_list[i][0] = new_w1
|
||||||
@@ -503,6 +535,13 @@ class SmartSpellChecker:
|
|||||||
# --- Pass 3: Per-word correction ---
|
# --- Pass 3: Per-word correction ---
|
||||||
parts: List[str] = []
|
parts: List[str] = []
|
||||||
for i, (word, sep) in enumerate(token_list):
|
for i, (word, sep) in enumerate(token_list):
|
||||||
|
# Skip words inside IPA brackets (brackets land in separators)
|
||||||
|
prev_sep = token_list[i - 1][1] if i > 0 else ""
|
||||||
|
if '[' in prev_sep or ']' in sep:
|
||||||
|
parts.append(word)
|
||||||
|
parts.append(sep)
|
||||||
|
continue
|
||||||
|
|
||||||
next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
|
next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
|
||||||
prev_word = token_list[i - 1][0] if i > 0 else ""
|
prev_word = token_list[i - 1][0] if i > 0 else ""
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user