From 21d37b5da1e987f439dcfae7ef126a6da8baa885 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 19 Mar 2026 10:40:37 +0100
Subject: [PATCH] Fix prefix matching: use alpha-only chars, min 4-char prefix

Prevents false positives where punctuation (e.g. apostrophes) in merged
tokens caused wrong dictionary matches. For example, the prefix "'se" of
"'sekandarr" matched as a word, which broke the IPA continuation row fix.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 2647922..894b0fc 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1063,16 +1063,18 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
         # Fallback: prefix matching for merged tokens where OCR joined
         # headword with garbled IPA (e.g. "schoolbagsku:lbæg",
         # "Scotland'skotland").  Find longest dictionary prefix.
-        if not ipa and len(clean) > 4:
-            for end in range(len(clean) - 1, 2, -1):
-                prefix = clean[:end]
-                test_ipa = _lookup_ipa(prefix, pronunciation)
-                if test_ipa:
-                    ipa = test_ipa
-                    # Replace token with just the headword prefix
-                    w = prefix
-                    words[i] = prefix
-                    break
+        # Use only alpha chars to avoid false matches on punctuation.
+        if not ipa:
+            alpha = re.sub(r'[^a-zA-Z]', '', clean)
+            if len(alpha) > 5:  # need at least 6 chars for meaningful split
+                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
+                    prefix = alpha[:end]
+                    test_ipa = _lookup_ipa(prefix, pronunciation)
+                    if test_ipa:
+                        ipa = test_ipa
+                        w = prefix
+                        words[i] = prefix
+                        break
         if ipa:
             words[i] = f"{w} [{ipa}]"
             # Strip garbled OCR phonetics after the IPA bracket.