From b98ea33a3a1a10ee2e5d679951d01f47c161fb12 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 11:15:14 +0100 Subject: [PATCH] Strip garbled OCR phonetics after IPA insertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _insert_missing_ipa now removes garbled phonetic text (e.g. "skea", "sku:l", "'sizaz") that follows the inserted IPA bracket. Keeps delimiters (–, -), uppercase words (German), and known English words. Fixes: "scare [skˈɛə] skea" → "scare [skˈɛə]" Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 26 +++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index e5a5a51..51d4d54 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1028,8 +1028,30 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: ipa = _lookup_ipa(clean, pronunciation) if ipa: words[i] = f"{w} [{ipa}]" - # Only insert for the FIRST word that has IPA - # (headword in English column) + # Strip garbled OCR phonetics after the IPA bracket. + # On scanned vocab pages, printed IPA is read as garbled + # text (e.g. "scare skea" where "skea" is garbled /skɛə/). + # After inserting correct IPA, remove remaining words that + # aren't real English words, delimiters, or German text. + kept = words[:i + 1] + for j in range(i + 1, len(words)): + wj = words[j] + # Delimiter — keep this and everything after + if wj in ('–', '—', '-', '/', '|', ',', ';'): + kept.extend(words[j:]) + break + # Starts with uppercase — likely German or proper noun + clean_j = re.sub(r'[^a-zA-Z]', '', wj) + if clean_j and clean_j[0].isupper(): + kept.extend(words[j:]) + break + # Known English word (≥2 chars) — keep it and rest + if clean_j and len(clean_j) >= 2: + if _lookup_ipa(clean_j, pronunciation): + kept.extend(words[j:]) + break + # Otherwise — likely garbled phonetics, skip + words = kept break return ' '.join(words)