diff --git a/klausur-service/backend/smart_spell.py b/klausur-service/backend/smart_spell.py index 66f1f0b..77fe7e8 100644 --- a/klausur-service/backend/smart_spell.py +++ b/klausur-service/backend/smart_spell.py @@ -70,6 +70,7 @@ _DIGIT_SUBS: Dict[str, List[str]] = { '6': ['g', 'G'], '8': ['b', 'B'], '|': ['I', 'l'], + '/': ['l'], # italic 'l' misread as slash (e.g. "p/" → "pl") } _SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys()) @@ -79,8 +80,8 @@ _UMLAUT_MAP = { 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü', } -# Tokenizer -_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|]+)([^A-Za-zÄÖÜäöüß'|]*)") +# Tokenizer — includes | and / so OCR artifacts like "p/" are treated as words +_TOKEN_RE = re.compile(r"([A-Za-zÄÖÜäöüß'|/]+)([^A-Za-zÄÖÜäöüß'|/]*)") # --------------------------------------------------------------------------- @@ -196,6 +197,10 @@ class SmartSpellChecker: if word.isdigit() or '.' in word: return None + # Skip IPA/phonetic content in brackets + if '[' in word or ']' in word: + return None + has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word) # 1. 
Already known → no fix @@ -454,6 +459,22 @@ class SmartSpellChecker: for i in range(len(token_list) - 1): w1 = token_list[i][0] w2_raw = token_list[i + 1][0] + + # Skip boundary repair for IPA/bracket content + # Brackets may be in the token OR in the adjacent separators + sep_before_w1 = token_list[i - 1][1] if i > 0 else "" + sep_after_w1 = token_list[i][1] + sep_after_w2 = token_list[i + 1][1] + has_bracket = ( + '[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw + or ']' in sep_after_w1 # w1 text was inside [brackets] + or '[' in sep_after_w1 # w2 starts a bracket + or ']' in sep_after_w2 # w2 text was inside [brackets] + or '[' in sep_before_w1 # w1 starts a bracket + ) + if has_bracket: + continue + # Include trailing punct from separator in w2 for abbreviation matching w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ") @@ -471,15 +492,26 @@ class SmartSpellChecker: old_freq = self._word_freq(w1) * self._word_freq(w2_raw) new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base) - # Abbreviation bonus: if repair produces a known abbreviation, - # add a large frequency boost (abbreviations have zero frequency) + # Abbreviation bonus: if repair produces a known abbreviation has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS if has_abbrev: - new_freq = max(new_freq, old_freq * 10) + # Accept abbreviation repair ONLY if at least one of the + # original words is rare/unknown (prevents "Can I" → "Ca nI" + # where both original words are common and correct). 
+ # "Rare" = frequency <= 1e-6 (covers "ats", "th" but not "Can", "I") + RARE_THRESHOLD = 1e-6 + orig_both_common = ( + self._word_freq(w1) > RARE_THRESHOLD + and self._word_freq(w2_raw) > RARE_THRESHOLD + ) + if not orig_both_common: + new_freq = max(new_freq, old_freq * 10) + else: + has_abbrev = False # both originals common → don't trust # Accept if repair produces a more frequent word pair # (threshold: at least 5x more frequent to avoid false positives) - if new_freq > old_freq * 5 or has_abbrev: + if new_freq > old_freq * 5: new_w2_punct = new_w2_full[len(new_w2_base):] changes.append(f"{w1} {w2_raw}→{new_w1} {new_w2_base}") token_list[i][0] = new_w1 @@ -503,6 +535,13 @@ class SmartSpellChecker: # --- Pass 3: Per-word correction --- parts: List[str] = [] for i, (word, sep) in enumerate(token_list): + # Skip a word right after '[' or right before ']' (brackets land in separators) + prev_sep = token_list[i - 1][1] if i > 0 else "" + if '[' in prev_sep or ']' in sep: + parts.append(word) + parts.append(sep) + continue + next_word = token_list[i + 1][0] if i + 1 < len(token_list) else "" prev_word = token_list[i - 1][0] if i > 0 else ""