From 19cbbf310a55716de82281e9712f0629dcd07af8 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 10:36:25 +0100 Subject: [PATCH] Improve garbled IPA cleanup: trailing strip, prefix match, broader guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Strip trailing garbled IPA after proper [IPA] brackets (e.g. "sea [sˈiː] si:" → "sea [sˈiː]") 2. Add prefix matching for merged tokens where OCR joined headword with garbled IPA (e.g. "schoolbagsku:lbæg" → "schoolbag [skˈuːlbæɡ]") 3. Broaden guard to also trigger on trailing non-dictionary words (e.g. "scare skea" → "scare [skˈɛə]") Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 113 ++++++++++++++++++++-- 1 file changed, 103 insertions(+), 10 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 397097b..2647922 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1022,11 +1022,6 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text for the headword, inserts correct [IPA], and strips the garbled fragments. - IMPORTANT: This function must only be called when ``_text_has_garbled_ipa`` - confirms that the text actually contains garbled phonetics. If the text - is clean (e.g. just "scissors"), IPA must NOT be inserted — the original - page had no phonetics on that line. - Only inserts for words that: - are standalone (not already followed by a bracket) - have an IPA entry in the dictionary @@ -1065,6 +1060,19 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: # Fallback: try without hyphens (e.g. "second-hand" → "secondhand") if not ipa and '-' in clean: ipa = _lookup_ipa(clean.replace('-', ''), pronunciation) + # Fallback: prefix matching for merged tokens where OCR joined + # headword with garbled IPA (e.g. "schoolbagsku:lbæg", + # "Scotland'skotland"). Find longest dictionary prefix. + if not ipa and len(clean) > 4: + for end in range(len(clean) - 1, 2, -1): + prefix = clean[:end] + test_ipa = _lookup_ipa(prefix, pronunciation) + if test_ipa: + ipa = test_ipa + # Replace token with just the headword prefix + w = prefix + words[i] = prefix + break if ipa: words[i] = f"{w} [{ipa}]" # Strip garbled OCR phonetics after the IPA bracket. @@ -1096,6 +1104,87 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: return ' '.join(words) +def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool: + """Check if text has a headword followed by non-dictionary trailing words. + + Used as an additional trigger for ``_insert_missing_ipa`` when + ``_text_has_garbled_ipa`` returns False because the garbled IPA + happens to look like plain ASCII (e.g. "skea" for /skɛə/). + """ + if not IPA_AVAILABLE: + return False + words = text.strip().split() + if len(words) < 2 or len(words) > 6: + return False + # Find first dictionary word + hw_idx = -1 + for i, w in enumerate(words): + clean = re.sub(r'[^a-zA-Z\'-]', '', w) + if not clean or len(clean) < 2: + continue + if clean.lower() in _GRAMMAR_BRACKET_WORDS: + continue + if _lookup_ipa(clean, pronunciation): + hw_idx = i + break + if hw_idx < 0 or hw_idx >= len(words) - 1: + return False + # Check ALL remaining words — if none are dictionary/delimiter/German, + # they are likely garbled IPA. + for j in range(hw_idx + 1, len(words)): + wj = words[j] + if wj in ('–', '—', '-', '/', '|', ',', ';'): + return False + clean_j = re.sub(r'[^a-zA-Z]', '', wj) + if clean_j and clean_j[0].isupper(): + return False + if clean_j and len(clean_j) >= 2 and _lookup_ipa(clean_j, pronunciation): + return False + return True + + +def _strip_post_bracket_garbled( + text: str, pronunciation: str = 'british', +) -> str: + """Strip garbled IPA fragments that trail after proper [IPA] brackets. + + E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]`` + ``seat [sˈiːt] si:t`` → ``seat [sˈiːt]`` + """ + if ']' not in text: + return text + last_bracket = text.rfind(']') + if last_bracket >= len(text) - 1: + return text + before = text[:last_bracket + 1].rstrip() + after = text[last_bracket + 1:].strip() + if not after: + return text + after_words = after.split() + kept: List[str] = [] + for idx, w in enumerate(after_words): + # Delimiter — keep rest + if w in ('–', '—', '-', '/', '|', ',', ';'): + kept.extend(after_words[idx:]) + break + # Contains IPA markers (length mark, IPA chars) — garbled, skip + if ':' in w or any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋˈˌ'): + continue + clean = re.sub(r'[^a-zA-Z]', '', w) + # Uppercase — likely German, keep rest + if clean and clean[0].isupper(): + kept.extend(after_words[idx:]) + break + # Known English word — keep rest + if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation): + kept.extend(after_words[idx:]) + break + # Unknown short word — likely garbled, skip + if kept: + return before + ' ' + ' '.join(kept) + return before + + def fix_ipa_continuation_cell( garbled_text: str, headword_text: str, @@ -1242,11 +1331,15 @@ def fix_cell_phonetics( if col_type == 'column_en': # Full processing: replace garbled IPA, strip orphan brackets. new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True) - if new_text == text and _text_has_garbled_ipa(text): - # Only insert IPA when there IS garbled phonetics in the - # text — never add IPA to clean text that had none on the - # original page. - new_text = _insert_missing_ipa(text, pronunciation) + if new_text == text: + # Insert IPA when garbled phonetics exist OR when trailing + # non-dictionary words suggest garbled IPA in plain ASCII. + if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation): + new_text = _insert_missing_ipa(text, pronunciation) + # Strip trailing garbled fragments after proper [IPA] brackets + # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]") + if ']' in new_text: + new_text = _strip_post_bracket_garbled(new_text, pronunciation) else: # column_text: replace garbled IPA, no orphan stripping new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)