From cc5ee7492123c3f08f6190fd3c1d067196f618c8 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 10:55:36 +0100 Subject: [PATCH] Use OCR-recognized IPA when word not in dictionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For merged tokens like "schoolbagsku:lbæg", split at IPA marker boundary instead of prefix-matching to a shorter dictionary word. Result: "schoolbag [sku:lbæg]" instead of "school [skˈuːl]". Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 36 ++++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 894b0fc..9c127e2 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1060,10 +1060,38 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: # Fallback: try without hyphens (e.g. "second-hand" → "secondhand") if not ipa and '-' in clean: ipa = _lookup_ipa(clean.replace('-', ''), pronunciation) - # Fallback: prefix matching for merged tokens where OCR joined - # headword with garbled IPA (e.g. "schoolbagsku:lbæg", - # "Scotland'skotland"). Find longest dictionary prefix. - # Use only alpha chars to avoid false matches on punctuation. + # Fallback 1: IPA-marker split for merged tokens where OCR + # joined headword with its IPA (e.g. "schoolbagsku:lbæg"). + # Find the first IPA marker character (:, æ, ɪ, etc.), walk + # backwards ≤3 chars for the onset consonant cluster, and + # split into headword + OCR IPA. + _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ') + if not ipa: + first_marker = next( + (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1, + ) + if first_marker >= 3: + split = first_marker + while (split > 0 + and split > first_marker - 4 + and w[split - 1].isalpha() + and w[split - 1].islower()): + split -= 1 + if split >= 2: + headword = w[:split] + ocr_ipa = w[split:] + hw_ipa = _lookup_ipa(headword, pronunciation) + if hw_ipa: + words[i] = f"{headword} [{hw_ipa}]" + else: + # Word not in dictionary — use OCR IPA + words[i] = f"{headword} [{ocr_ipa}]" + words = words[:i + 1] + ipa = True # signal that we handled it + break + # Fallback 2: prefix matching for merged tokens WITHOUT IPA + # markers (e.g. "Scotland'skotland"). Find longest dictionary + # prefix using only alpha chars to avoid punctuation matches. if not ipa: alpha = re.sub(r'[^a-zA-Z]', '', clean) if len(alpha) > 5: # need at least 6 chars for meaningful split