From cc5ee7492123c3f08f6190fd3c1d067196f618c8 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 19 Mar 2026 10:55:36 +0100
Subject: [PATCH] Use OCR-recognized IPA when word not in dictionary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For merged tokens like "schoolbagsku:lbæg", split at the IPA marker
boundary instead of prefix-matching to a shorter dictionary word.
Result: "schoolbag [sku:lbæg]" instead of "school [skˈuːl]".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 36 ++++++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 894b0fc..9c127e2 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1060,10 +1060,38 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
         # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
         if not ipa and '-' in clean:
             ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
-        # Fallback: prefix matching for merged tokens where OCR joined
-        # headword with garbled IPA (e.g. "schoolbagsku:lbæg",
-        # "Scotland'skotland").  Find longest dictionary prefix.
-        # Use only alpha chars to avoid false matches on punctuation.
+        # Fallback 1: IPA-marker split for merged tokens where OCR
+        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
+        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
+        # backwards over the lowercase onset cluster, and split into
+        # headword + OCR IPA. NOTE(review): loop allows 4 chars; was 3 meant?
+        _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
+        if not ipa:
+            first_marker = next(
+                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
+            )
+            if first_marker >= 3:
+                split = first_marker
+                while (split > 0
+                       and split > first_marker - 4
+                       and w[split - 1].isalpha()
+                       and w[split - 1].islower()):
+                    split -= 1
+                if split >= 2:
+                    headword = w[:split]
+                    ocr_ipa = w[split:]
+                    hw_ipa = _lookup_ipa(headword, pronunciation)
+                    if hw_ipa:
+                        words[i] = f"{headword} [{hw_ipa}]"
+                    else:
+                        # Word not in dictionary — use OCR IPA
+                        words[i] = f"{headword} [{ocr_ipa}]"
+                    words = words[:i + 1]
+                    ipa = True  # signal that we handled it
+                    break
+        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
+        # markers (e.g. "Scotland'skotland").  Find longest dictionary
+        # prefix using only alpha chars to avoid punctuation matches.
         if not ipa:
             alpha = re.sub(r'[^a-zA-Z]', '', clean)
             if len(alpha) > 5:  # need at least 6 chars for meaningful split